In an earlier article, "SkeyeRTSPLive: A Sharp Tool for Bringing Traditional Video Surveillance onto the Internet", I described the RTSP-to-RTMP relay process. In simplified form: SkeyeRTSPClient pulls the RTSP stream to obtain the encoded audio and video data, and SkeyeRTMPPusher pushes it back out. The process sounds very simple; in actual development, however, we found it is not as simple as imagined. First, the RTSP protocol carries a variety of audio and video encoding formats: audio may be AAC, G711, G726, etc., and video may be H264, H265, MJPEG, MPEG, and other formats, while SkeyeRTMPPusher only supports H264 (extended to support H265). In that case, audio can be transcoded into AAC through SkeyeAACEncoder, and video can be decoded into raw data through SkeyeVideoDecoder and then transcoded by SkeyeVideoEncoder into the format required for RTMP pushing. In this article, we focus on the decoding process of SkeyeVideoDecoder based on Nvidia discrete graphics cards.
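To make the pipeline concrete, here is an illustrative sketch of the video path only. All types and methods below are hypothetical stand-ins with stub bodies; the real SkeyeRTSPClient/SkeyeVideoDecoder/SkeyeVideoEncoder/SkeyeRTMPPusher interfaces were covered in the earlier article:

```cpp
#include <cstdint>
#include <cstddef>

// Illustrative stand-ins only -- NOT the real Skeye interfaces.
enum VideoCodec { VIDEO_CODEC_H264, VIDEO_CODEC_H265, VIDEO_CODEC_MJPEG };
struct Buffer { const uint8_t *data; size_t size; };

struct VideoDecoder { Buffer Decode(const uint8_t *p, size_t n) { return { p, n }; } }; // SkeyeVideoDecoder role (stub)
struct VideoEncoder { Buffer Encode(Buffer raw) { return raw; } };                      // SkeyeVideoEncoder role (stub)
struct RtmpPusher   { void PushVideo(const uint8_t *, size_t) {} };                     // SkeyeRTMPPusher role (stub)

VideoDecoder g_decoder; VideoEncoder g_encoder; RtmpPusher g_pusher;

// Video path: H264 passes straight through to the RTMP pusher; any other
// codec is decoded to raw frames and re-encoded to H264 first.
void onRtspVideoFrame(const uint8_t *pData, size_t nSize, VideoCodec codec)
{
    if (codec == VIDEO_CODEC_H264) {
        g_pusher.PushVideo(pData, nSize);
    } else {
        Buffer raw  = g_decoder.Decode(pData, nSize);
        Buffer h264 = g_encoder.Encode(raw);
        g_pusher.PushVideo(h264.data, h264.size);
    }
}
```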
SkeyeVideoDecoder is based on SkeyeNvDecoder, a hardware decoding library for Nvidia discrete graphics cards
The SkeyeNvDecoder library is a hardware decoding program built on the driver of Nvidia's discrete graphics cards. It is very efficient and has powerful parallel decoding capabilities: its decoding efficiency is at least 5-6 times that of ffmpeg software decoding, and on the latest RTX-series graphics cards it is even 10-12 times that of software decoding, easily handling multiple 4K or even 8K high-definition streams without pressure. This article uses the latest graphics card driver available at the time of writing (2019-07-14) and requires CUDA version 10.0 or above.
1. The interface declaration is as follows:
```cpp
#ifndef SKEYENVDECODERAPI_H
#define SKEYENVDECODERAPI_H

#include <string>
#include <stdint.h>

//++ typedefine start
#ifndef SKEYENVDECODER_HANDLE
#define SKEYENVDECODER_HANDLE void*
#endif // SKEYENVDECODER_HANDLE

typedef enum _OutputFormat // native = the decoder's default output (NV12)
{
    native = 0,
    bgrp,
    rgbp,
    bgra,
    rgba,
    bgra64,
    rgba64
} OutputFormat;

typedef enum _SKEYENvDecoder_CodecType {
    SKEYENvDecoder_Codec_MPEG1 = 0,  /**< MPEG1      */
    SKEYENvDecoder_Codec_MPEG2,      /**< MPEG2      */
    SKEYENvDecoder_Codec_MPEG4,      /**< MPEG4      */
    SKEYENvDecoder_Codec_VC1,        /**< VC1        */
    SKEYENvDecoder_Codec_H264,       /**< H264       */
    SKEYENvDecoder_Codec_JPEG,       /**< JPEG       */
    SKEYENvDecoder_Codec_H264_SVC,   /**< H264-SVC   */
    SKEYENvDecoder_Codec_H264_MVC,   /**< H264-MVC   */
    SKEYENvDecoder_Codec_HEVC,       /**< HEVC       */
    SKEYENvDecoder_Codec_VP8,        /**< VP8        */
    SKEYENvDecoder_Codec_VP9,        /**< VP9        */
    SKEYENvDecoder_Codec_NumCodecs,  /**< Max codecs */
} SKEYENvDecoder_CodecType;

typedef enum _SKEYENvDecoder_YUVType {
    // Uncompressed YUV
    SKEYENvDecoder_YUV420 = (('I' << 24) | ('Y' << 16) | ('U' << 8) | ('V')), /**< Y,U,V (4:2:0)     */
    SKEYENvDecoder_YV12   = (('Y' << 24) | ('V' << 16) | ('1' << 8) | ('2')), /**< Y,V,U (4:2:0)     */
    SKEYENvDecoder_NV12   = (('N' << 24) | ('V' << 16) | ('1' << 8) | ('2')), /**< Y,UV  (4:2:0)     */
    SKEYENvDecoder_YUYV   = (('Y' << 24) | ('U' << 16) | ('Y' << 8) | ('V')), /**< YUYV/YUY2 (4:2:2) */
    SKEYENvDecoder_UYVY   = (('U' << 24) | ('Y' << 16) | ('V' << 8) | ('Y'))  /**< UYVY (4:2:2)      */
} SKEYENvDecoder_YUVType;

#ifdef __cplusplus
extern "C" {
#endif

int SKEYENvDecoder_Initsize(std::string &erroStr);

// Do not set bLowLatency unless low-latency mode is required; with this flag
// it is difficult to reach 100% utilization of the hardware decoder.
SKEYENVDECODER_HANDLE SKEYENvDecoder_Create(SKEYENvDecoder_CodecType codec, int videoW, int videoH,
    bool bLowLatency, OutputFormat eOutputFormat, int &errCode, std::string &erroStr);

int SKEYENvDecoder_Decode(SKEYENVDECODER_HANDLE handle, const uint8_t *pData, int nSize,
    uint8_t ***pppFrame, int *pnFrameLen, int *pnFrameReturned);

void SKEYENvDecoder_Release(SKEYENVDECODER_HANDLE handle);

int SKEYENvDecoder_Uninitsize();

#ifdef __cplusplus
}
#endif

#endif // SKEYENVDECODERAPI_H
```
2. SkeyeNvDecoder decoding library call process
- The first step is to initialize and register the decoder
Note that decoder registration is global and this function only needs to be called once:

```cpp
int SKEYENvDecoder_Initsize(string &erroStr)
{
    try {
        if (!isInitsized) { // The graphics cards are only enumerated once
            ck(cuInit(0));
            int nGpu = 0;
            ck(cuDeviceGetCount(&nGpu));
            for (int i = 0; i < nGpu; i++) {
                CUdevice cuDevice = 0;
                ck(cuDeviceGet(&cuDevice, i));
                char szDeviceName[128];
                ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
                LOG(INFO) << "Find Gpu: " << szDeviceName << std::endl;
                CUcontext cuContext = NULL;
                ck(cuCtxCreate(&cuContext, CU_CTX_SCHED_BLOCKING_SYNC, cuDevice));
                m_ctxV.push_back({ cuContext, szDeviceName });
            }
            isInitsized = true;
            m_curIndex = 0;
        }
        if (m_ctxV.empty()) {
            return -1;
        }
    }
    catch (const std::exception &ex) {
        erroStr = ex.what();
        std::cout << ex.what();
        return -2;
    }
    return 1;
}
```
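The ck(...) error-check wrapper used throughout these snippets is not shown in the article. A minimal sketch of such a helper, assuming it throws on failure so the surrounding try/catch can surface the error string, might look like this:

```cpp
#include <cuda.h>
#include <sstream>
#include <stdexcept>

// Hypothetical reconstruction of the ck(...) helper: converts a failed CUDA
// driver API result into an exception caught by SKEYENvDecoder_Initsize.
inline void checkCudaResult(CUresult res, const char *expr)
{
    if (res != CUDA_SUCCESS) {
        const char *name = nullptr;
        cuGetErrorName(res, &name); // CUDA driver API
        std::ostringstream oss;
        oss << "CUDA error " << (name ? name : "UNKNOWN") << " in: " << expr;
        throw std::runtime_error(oss.str());
    }
}
#define ck(call) checkCudaResult((call), #call)
```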
- The second step is to create a decoder instance
```cpp
SKEYENVDECODER_HANDLE SKEYENvDecoder_Create(SKEYENvDecoder_CodecType codec, int videoW, int videoH,
    bool bLowLatency, OutputFormat eOutputFormat, int &errCode, string &erroStr)
{
    //if (!isInitsized || !m_ctxV.size()) {
    //    return NULL;
    //}
    try {
        ck(cuInit(0));
        int nGpu = 0;
        ck(cuDeviceGetCount(&nGpu));
        CUcontext cuContext = NULL;
        // Round-robin decoder instances across the available GPUs
        m_curIndex++;
        m_curIndex = (m_curIndex) % nGpu;
        for (int i = 0; i < nGpu; i++) {
            if (m_curIndex == i) {
                CUdevice cuDevice = 0;
                ck(cuDeviceGet(&cuDevice, i));
                char szDeviceName[128];
                ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
                LOG(INFO) << "Find Gpu: " << szDeviceName << std::endl;
                ck(cuCtxCreate(&cuContext, CU_CTX_SCHED_BLOCKING_SYNC, cuDevice));
            }
        }
        //std::pair<CUcontext, std::string> &v = m_ctxV.at(m_curIndex++ % m_ctxV.size());
        //std::cout << "Use Contex in " << v.second << std::endl;
        const char *aszChromaFormat[] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" };
        cudaVideoCodec aeCodec[] = {
            cudaVideoCodec_JPEG,  cudaVideoCodec_MPEG1, cudaVideoCodec_MPEG2, cudaVideoCodec_MPEG4,
            cudaVideoCodec_H264,  cudaVideoCodec_HEVC,  cudaVideoCodec_HEVC,  cudaVideoCodec_HEVC,
            cudaVideoCodec_HEVC,  cudaVideoCodec_HEVC,  cudaVideoCodec_HEVC,  cudaVideoCodec_VC1,
            cudaVideoCodec_VP8,   cudaVideoCodec_VP9,   cudaVideoCodec_VP9,   cudaVideoCodec_VP9 };
        int anBitDepthMinus8[] = { 0, 0, 0, 0, 0, 0, 2, 4, 0, 2, 4, 0, 0, 0, 2, 4 };
        cudaVideoChromaFormat aeChromaFormat[] = {
            cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420,
            cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420,
            cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_444,
            cudaVideoChromaFormat_444, cudaVideoChromaFormat_444, cudaVideoChromaFormat_420,
            cudaVideoChromaFormat_420, cudaVideoChromaFormat_420, cudaVideoChromaFormat_420,
            cudaVideoChromaFormat_420 };
        CUVIDDECODECAPS videoDecodeCaps = {};
        videoDecodeCaps.eCodecType = (cudaVideoCodec)codec;
        videoDecodeCaps.eChromaFormat = cudaVideoChromaFormat_420;
        videoDecodeCaps.nBitDepthMinus8 = 0;
        for (int i = 0; i < sizeof(aeCodec) / sizeof(aeCodec[0]); i++) {
            if (aeCodec[i] == (cudaVideoCodec)codec) {
                videoDecodeCaps.eChromaFormat = aeChromaFormat[i];
                videoDecodeCaps.nBitDepthMinus8 = anBitDepthMinus8[i];
                break;
            }
        }
        errCode = cuvidGetDecoderCaps(&videoDecodeCaps);
        if (CUDA_SUCCESS == errCode) {
            // Check whether the GPU supports this codec at all
            LOG(INFO) << "cuvid Decoder Caps nMaxWidth " << videoDecodeCaps.nMaxWidth
                      << " nMaxHeight " << videoDecodeCaps.nMaxHeight << std::endl;
            if (!videoDecodeCaps.bIsSupported) {
                erroStr = "Codec not supported on this GPU Decoder";
                errCode = -1;
            }
            else {
                // Check whether the requested resolution is within the decoder's limits
                if (videoDecodeCaps.nMaxWidth >= videoW && videoDecodeCaps.nMaxHeight >= videoH) {
                    NvDecoder *pDecoder = new NvDecoder(/*v.first*/cuContext, videoW, videoH,
                        eOutputFormat == native ? false : true, (cudaVideoCodec)codec,
                        NULL, bLowLatency, eOutputFormat);
                    pDecoder->Start();
                    return pDecoder;
                }
                else {
                    erroStr = "Width and height not supported on this GPU Decoder";
                    errCode = -2;
                }
            }
        }
    }
    catch (std::exception &e) {
        erroStr = e.what();
    }
    return NULL;
}
```
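A brief usage sketch, assuming a 1080p H264 stream with BGRA output; the two failure paths above are reported back through errCode and erroStr:

```cpp
// Hypothetical caller of the Create function declared in the header above.
int errCode = 0;
std::string errStr;
SKEYENVDECODER_HANDLE handle = SKEYENvDecoder_Create(
    SKEYENvDecoder_Codec_H264, 1920, 1080,
    /*bLowLatency=*/false, bgra, errCode, errStr);
if (!handle) {
    // errCode == -1: codec not supported; errCode == -2: resolution too large
    std::cerr << "SKEYENvDecoder_Create failed (" << errCode << "): " << errStr << std::endl;
}
```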
- The third step is to call the decoding function to decode
```cpp
int SKEYENvDecoder_Decode(SKEYENVDECODER_HANDLE handle, const uint8_t *pData, int nSize,
    uint8_t ***pppFrame, int *pnFrameLen, int *pnFrameReturned)
{
    if (!handle)
        return -1;
    NvDecoder *pDecoder = (NvDecoder *)handle;
    int anSize[] = { 0, 3, 3, 4, 4, 8, 8 }; // bytes per pixel for each OutputFormat
    //std::unique_ptr<uint8_t[]> pImage(new uint8_t[nFrameSize]);
    std::vector<uint8_t *> *vecOutBuffer = pDecoder->GetFrameBufferVector();
    size_t nFrameSize = pDecoder->GetOutFrameSize();
    *pnFrameLen = nFrameSize;
    int nFrameReturned = 0, nFrame = 0;
    uint8_t **ppFrame = NULL;
    bool bLowLatency = pDecoder->IsSetLowLatency();
    bool bSuc = pDecoder->Decode(pData, nSize, &ppFrame, &nFrameReturned,
        CUVID_PKT_ENDOFPICTURE/*bLowLatency ? CUVID_PKT_ENDOFPICTURE : 0*/);
    if (!bSuc)
        return -2;
    //if (!nFrame && nFrameReturned > 0)
    //    LOG(INFO) << "nFrameReturned = " << nFrameReturned; //pDecoder->GetVideoInfo();
    for (int i = 0; i < nFrameReturned; i++) {
        if (native != pDecoder->GetSetOutputFormat()) {
            if (i >= (*vecOutBuffer).size()) {
                (*vecOutBuffer).push_back(new uint8_t[nFrameSize]);
            }
        }
        if (pDecoder->GetBitDepth() == 8) {
            switch (pDecoder->GetSetOutputFormat()) {
            case native:
                //GetImage((CUdeviceptr)ppFrame[i], (*vecOutBuffer)[i], pDecoder->GetWidth(),
                //    pDecoder->GetHeight() + (pDecoder->GetChromaHeight() * pDecoder->GetNumChromaPlanes()));
                break;
            case bgrp:
                if (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444)
                    YUV444ToColorPlanar<BGRA32>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                else
                    Nv12ToColorPlanar<BGRA32>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                GetImage(pDecoder->GetDeviceImagePtr(), (*vecOutBuffer)[i],
                    pDecoder->GetWidth(), 3 * pDecoder->GetHeight());
                break;
            case rgbp:
                if (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444)
                    YUV444ToColorPlanar<RGBA32>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                else
                    Nv12ToColorPlanar<RGBA32>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                GetImage(pDecoder->GetDeviceImagePtr(), (*vecOutBuffer)[i],
                    pDecoder->GetWidth(), 3 * pDecoder->GetHeight());
                break;
            case bgra:
                if (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444)
                    YUV444ToColor32<BGRA32>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), 4 * pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                else
                    Nv12ToColor32<BGRA32>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), 4 * pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                GetImage(pDecoder->GetDeviceImagePtr(), (*vecOutBuffer)[i],
                    4 * pDecoder->GetWidth(), pDecoder->GetHeight());
                break;
            case rgba:
                if (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444)
                    YUV444ToColor32<RGBA32>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), 4 * pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                else
                    Nv12ToColor32<RGBA32>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), 4 * pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                GetImage(pDecoder->GetDeviceImagePtr(), (*vecOutBuffer)[i],
                    4 * pDecoder->GetWidth(), pDecoder->GetHeight());
                break;
            case bgra64:
                if (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444)
                    YUV444ToColor64<BGRA64>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), 8 * pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                else
                    Nv12ToColor64<BGRA64>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), 8 * pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                GetImage(pDecoder->GetDeviceImagePtr(), (*vecOutBuffer)[i],
                    8 * pDecoder->GetWidth(), pDecoder->GetHeight());
                break;
            case rgba64:
                if (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444)
                    YUV444ToColor64<RGBA64>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), 8 * pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                else
                    Nv12ToColor64<RGBA64>((uint8_t *)ppFrame[i], pDecoder->GetWidth(),
                        (uint8_t *)pDecoder->GetDeviceImagePtr(), 8 * pDecoder->GetWidth(),
                        pDecoder->GetWidth(), pDecoder->GetHeight());
                GetImage(pDecoder->GetDeviceImagePtr(), (*vecOutBuffer)[i],
                    8 * pDecoder->GetWidth(), pDecoder->GetHeight());
                break;
            }
        }
    }
    nFrame += nFrameReturned;
    if (nFrameReturned > 0) {
        if (pnFrameReturned)
            *pnFrameReturned = nFrameReturned;
        if (native != pDecoder->GetSetOutputFormat()) {
            if (pppFrame && (*vecOutBuffer).size() > 0)
                *pppFrame = &(*vecOutBuffer)[0];
        }
        else {
            if (pppFrame && ppFrame)
                *pppFrame = ppFrame;
        }
    }
    return 1; // success; the decoded frame count is reported via pnFrameReturned
}
```
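A minimal sketch of feeding one encoded frame to the decoder; here pData/nSize are assumed to hold one complete encoded frame (e.g. as delivered by the RTSP source):

```cpp
// Hypothetical caller: pData/nSize hold one complete encoded frame.
uint8_t **ppFrames = nullptr;
int nFrameLen = 0, nFrameReturned = 0;
int ret = SKEYENvDecoder_Decode(handle, pData, nSize,
                                &ppFrames, &nFrameLen, &nFrameReturned);
if (ret > 0) {
    for (int i = 0; i < nFrameReturned; i++) {
        // ppFrames[i] points to one decoded frame of nFrameLen bytes,
        // in the OutputFormat chosen at SKEYENvDecoder_Create time.
    }
}
```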
- The fourth step, stop decoding and destroy the decoder
```cpp
void SKEYENvDecoder_Release(SKEYENVDECODER_HANDLE handle)
{
    if (!handle)
        return;
    NvDecoder *pDecoder = (NvDecoder *)handle;
    pDecoder->Stop();
    delete pDecoder;
    m_curIndex--;
    if (m_curIndex < 0)
        m_curIndex = 0;
}
```
- The fifth step is to unregister the decoder and release resources
```cpp
int SKEYENvDecoder_Uninitsize()
{
    isInitsized = false;
    for (int nI = 0; nI < m_ctxV.size(); nI++) {
        cuCtxDestroy(m_ctxV[nI].first);
    }
    m_ctxV.clear();
    m_curIndex = 0;
    return 1;
}
```
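Putting the five steps together, a minimal end-to-end usage sketch of the API declared above might look like the following; getNextEncodedFrame() is a hypothetical stand-in for the encoded-frame source (e.g. SkeyeRTSPClient):

```cpp
#include "SkeyeNvDecoderAPI.h"
#include <iostream>

// Hypothetical stream source; not part of the API above.
bool getNextEncodedFrame(const uint8_t **ppData, int *pnSize);

int main()
{
    std::string err;
    if (SKEYENvDecoder_Initsize(err) != 1) {              // step 1: global init (once per process)
        std::cerr << "init failed: " << err << std::endl;
        return -1;
    }

    int errCode = 0;
    SKEYENVDECODER_HANDLE h = SKEYENvDecoder_Create(      // step 2: create a decoder instance
        SKEYENvDecoder_Codec_H264, 1920, 1080,
        /*bLowLatency=*/false, bgra, errCode, err);
    if (!h) {
        std::cerr << "create failed (" << errCode << "): " << err << std::endl;
        SKEYENvDecoder_Uninitsize();
        return -1;
    }

    const uint8_t *pData = nullptr;
    int nSize = 0;
    while (getNextEncodedFrame(&pData, &nSize)) {         // step 3: decode frame by frame
        uint8_t **ppFrames = nullptr;
        int nFrameLen = 0, nReturned = 0;
        if (SKEYENvDecoder_Decode(h, pData, nSize, &ppFrames, &nFrameLen, &nReturned) > 0) {
            for (int i = 0; i < nReturned; i++) {
                // ppFrames[i]: one decoded BGRA frame of nFrameLen bytes
            }
        }
    }

    SKEYENvDecoder_Release(h);                            // step 4: stop and destroy the decoder
    SKEYENvDecoder_Uninitsize();                          // step 5: destroy the CUDA contexts
    return 0;
}
```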
At this point, the encapsulation of SkeyeNvDecoder is complete, and we can call Nvidia graphics cards through its interface to run hardware decoding tests. In a real application test, hardware-decoding and rendering 12 channels kept the i5 CPU at about 11% while the 730 graphics card ran at 75-80%, as shown in the figure below:
If you have any technical questions, feel free to contact me:
295222688@qq.com
You can also join the SkeyePlayer streaming media player QQ group for discussion:
102644504