Commit aa7a6a96 authored by Miguel Casas, committed by Commit Bot

media/gpu/vaapi: batch submitting VABufferIDs for VP9 decoding

Decoding a bitstream in VA has two steps: one, submitting the parsed
parameters and the encoded chunk, and two, executing the decode. For
the first part, ToT VaapiWrapper submits every piece of data
individually, acquiring and releasing |va_lock_| every time. This is
unnecessary, so this CL refactors the SubmitBuffer() method into a new
SubmitBuffer_Locked() and adds a SubmitBuffers() that bundles several
submissions together under a single lock acquisition, as sketched
below.
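
For orientation, a minimal sketch of the pattern (a plain std::mutex
and made-up names stand in for |va_lock_| and the VaapiWrapper
methods; this is illustrative, not the actual Chromium code):

  #include <mutex>
  #include <vector>

  struct Buffer {};  // stands in for parsed parameters or slice data

  std::mutex lock;  // stands in for |va_lock_|

  bool SubmitOneLocked(const Buffer&) {
    // Would create a VA buffer and copy the data; |lock| must be held.
    return true;
  }

  // Pre-CL shape: each submission takes and drops the lock.
  bool SubmitOne(const Buffer& b) {
    std::lock_guard<std::mutex> auto_lock(lock);
    return SubmitOneLocked(b);
  }

  // Post-CL shape: one lock acquisition for the whole batch.
  bool SubmitAll(const std::vector<Buffer>& buffers) {
    std::lock_guard<std::mutex> auto_lock(lock);
    for (const Buffer& b : buffers) {
      if (!SubmitOneLocked(b))
        return false;
    }
    return true;
  }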

This is verified via chrome://tracing and codepen.io/full/RwarYvG,
which plays four 1280x572 VP9 videos at the same time. Tracing is
captured for a few seconds; results are summarised in [1,2]. In short:
the total decode CPU time doesn't change much on either kohaku or
Braswell (reks), but batch-submitting takes less time, especially on
BSW, from ~3x0.089ms = ~0.267ms to ~0.236ms [3], i.e. about 10%. The
reduced lock contention also gives an ancillary reduction in
Execute_Locked(), from 3.295ms to 3.165ms.

These improvements are of course very small; the point of this CL is
to reduce lock/unlock churn and the associated contention, a benefit
that grows with the number of concurrent decodes (e.g. Meet grid
scenarios).

Later CLs will migrate the other decoders, and possibly avoid the call
to vaCreateBuffer(), which takes a good 50% of the SubmitBuffer/s()
time.

[1] Kohaku w/o patch: https://imgur.com/a/nVuE0Nk
[2] Kohaku with patch: https://imgur.com/a/xhdbqHn
[3] VP9 ToT calls SubmitBuffer() 3 times per incoming encoded buffer.

Bug: b/166646505
Change-Id: I1b8e36bb1d7107b5367b0b41137e2dc6625e1569
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2393629
Commit-Queue: Miguel Casas <mcasas@chromium.org>
Reviewed-by: Hirokazu Honda <hiroh@chromium.org>
Reviewed-by: Andres Calderon Jaramillo <andrescj@chromium.org>
Cr-Commit-Position: refs/heads/master@{#805300}
parent 83b6b15e
--- a/media/gpu/vaapi/vaapi_wrapper.cc
+++ b/media/gpu/vaapi/vaapi_wrapper.cc
@@ -1778,33 +1778,20 @@ bool VaapiWrapper::SyncSurface(VASurfaceID va_surface_id) {
 bool VaapiWrapper::SubmitBuffer(VABufferType va_buffer_type,
                                 size_t size,
-                                const void* buffer) {
-  DCHECK_LT(va_buffer_type, VABufferTypeMax);
-  DCHECK(buffer);
-
+                                const void* data) {
   TRACE_EVENT0("media,gpu", "VaapiWrapper::SubmitBuffer");
   base::AutoLock auto_lock(*va_lock_);
+  TRACE_EVENT0("media,gpu", "VaapiWrapper::SubmitBufferLocked");
+  return SubmitBuffer_Locked({va_buffer_type, size, data});
+}
 
-  VABufferID buffer_id;
-  {
-    TRACE_EVENT0("media,gpu", "VaapiWrapper::SubmitBuffer_vaCreateBuffer");
-    const VAStatus va_res =
-        vaCreateBuffer(va_display_, va_context_id_, va_buffer_type, size, 1,
-                       nullptr, &buffer_id);
-    VA_SUCCESS_OR_RETURN(va_res, VaapiFunctions::kVACreateBuffer, false);
+bool VaapiWrapper::SubmitBuffers(
+    const std::vector<VABufferDescriptor>& va_buffers) {
+  TRACE_EVENT0("media,gpu", "VaapiWrapper::SubmitBuffers");
+  base::AutoLock auto_lock(*va_lock_);
+  for (const VABufferDescriptor& va_buffer : va_buffers) {
+    if (!SubmitBuffer_Locked(va_buffer))
+      return false;
   }
-
-  ScopedVABufferMapping mapping(
-      va_lock_, va_display_, buffer_id,
-      base::BindOnce(base::IgnoreResult(&vaDestroyBuffer), va_display_));
-  if (!mapping.IsValid())
-    return false;
-
-  // TODO(selcott): Investigate potentially faster alternatives to memcpy here
-  // such as libyuv::CopyX and family.
-  memcpy(mapping.data(), buffer, size);
-
-  pending_va_buffers_.push_back(buffer_id);
   return true;
 }
@@ -2465,4 +2452,39 @@ bool VaapiWrapper::Execute_Locked(VASurfaceID va_surface_id) {
   return true;
 }
 
+bool VaapiWrapper::SubmitBuffer_Locked(const VABufferDescriptor& va_buffer) {
+  TRACE_EVENT0("media,gpu", "VaapiWrapper::SubmitBuffer_Locked");
+  va_lock_->AssertAcquired();
+
+  DCHECK_LT(va_buffer.type, VABufferTypeMax);
+  DCHECK(va_buffer.data);
+
+  unsigned int va_buffer_size;
+  if (!base::CheckedNumeric<size_t>(va_buffer.size)
+           .AssignIfValid(&va_buffer_size)) {
+    return false;
+  }
+
+  VABufferID buffer_id;
+  {
+    TRACE_EVENT0("media,gpu",
+                 "VaapiWrapper::SubmitBuffer_Locked_vaCreateBuffer");
+    const VAStatus va_res =
+        vaCreateBuffer(va_display_, va_context_id_, va_buffer.type,
+                       va_buffer_size, 1, nullptr, &buffer_id);
+    VA_SUCCESS_OR_RETURN(va_res, VaapiFunctions::kVACreateBuffer, false);
+  }
+
+  ScopedVABufferMapping mapping(
+      va_lock_, va_display_, buffer_id,
+      base::BindOnce(base::IgnoreResult(&vaDestroyBuffer), va_display_));
+  if (!mapping.IsValid())
+    return false;
+  memcpy(mapping.data(), va_buffer.data, va_buffer.size);
+
+  pending_va_buffers_.push_back(buffer_id);
+  return true;
+}
+
 }  // namespace media
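
Note: the size check in SubmitBuffer_Locked() narrows the caller's
size_t to the unsigned int that vaCreateBuffer() takes, failing rather
than silently truncating. A standalone sketch of that pattern, using
base::CheckedNumeric exactly as in the hunk above (the helper name is
made up for illustration):

  #include <cstddef>

  #include "base/numerics/checked_math.h"

  // Hypothetical helper: returns true and fills |out| only if |size|
  // fits in an unsigned int, mirroring the overflow check that
  // SubmitBuffer_Locked() performs before calling vaCreateBuffer().
  bool NarrowSizeForLibva(size_t size, unsigned int* out) {
    return base::CheckedNumeric<size_t>(size).AssignIfValid(out);
  }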
--- a/media/gpu/vaapi/vaapi_wrapper.h
+++ b/media/gpu/vaapi/vaapi_wrapper.h
@@ -299,22 +299,27 @@ class MEDIA_GPU_EXPORT VaapiWrapper
   // between contexts.
   bool SyncSurface(VASurfaceID va_surface_id);
 
-  // Submit parameters or slice data of |va_buffer_type|, copying them from
-  // |buffer| of size |size|, into HW codec. The data in |buffer| is no
-  // longer needed and can be freed after this method returns.
-  // Data submitted via this method awaits in the HW codec until
-  // ExecuteAndDestroyPendingBuffers() is called to execute or
-  // DestroyPendingBuffers() is used to cancel a pending job.
-  bool SubmitBuffer(VABufferType va_buffer_type,
-                    size_t size,
-                    const void* buffer);
+  // Calls SubmitBuffer_Locked() to request libva to allocate a new VABufferID
+  // of |va_buffer_type| and |size|, and to copy the |data| into it. The
+  // allocated VABufferIDs stay alive until DestroyPendingBuffers_Locked(). Note
+  // that this method does not submit the buffers for execution, they are simply
+  // stored until ExecuteAndDestroyPendingBuffers()/Execute_Locked(). The
+  // ownership of |data| stays with the caller.
+  bool SubmitBuffer(VABufferType va_buffer_type, size_t size, const void* data);
 
   // Convenient templatized version of SubmitBuffer() where |size| is deduced to
-  // be the size of the type of |*buffer|.
+  // be the size of the type of |*data|.
   template <typename T>
-  bool SubmitBuffer(VABufferType va_buffer_type, const T* buffer) {
-    return SubmitBuffer(va_buffer_type, sizeof(T), buffer);
+  bool SubmitBuffer(VABufferType va_buffer_type, const T* data) {
+    return SubmitBuffer(va_buffer_type, sizeof(T), data);
   }
 
+  // Batch-version of SubmitBuffer(), where the lock for accessing libva is
+  // acquired only once.
+  struct VABufferDescriptor {
+    VABufferType type;
+    size_t size;
+    const void* data;
+  };
+  bool SubmitBuffers(const std::vector<VABufferDescriptor>& va_buffers);
+
   // Submit a VAEncMiscParameterBuffer of given |misc_param_type|, copying its
   // data from |buffer| of size |size|, into HW codec. The data in |buffer| is
@@ -441,6 +446,11 @@ class MEDIA_GPU_EXPORT VaapiWrapper
   void DestroyPendingBuffers_Locked() EXCLUSIVE_LOCKS_REQUIRED(va_lock_);
 
+  // Requests libva to allocate a new VABufferID of type |va_buffer.type|, maps
+  // it and copies |va_buffer.size| contents of |va_buffer.data| to it.
+  bool SubmitBuffer_Locked(const VABufferDescriptor& va_buffer)
+      EXCLUSIVE_LOCKS_REQUIRED(va_lock_);
+
   const CodecMode mode_;
 
   // Pointer to VADisplayState's member |va_lock_|. Guaranteed to be valid for
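
With these declarations, a caller can hand several VABufferDescriptors
to SubmitBuffers() in one call. A hypothetical two-buffer example
(|wrapper| is assumed to be a valid scoped_refptr<VaapiWrapper>; the
VP9 delegate change below is the real in-tree usage):

  VAPictureParameterBufferVP9 pic_param{};
  VASliceParameterBufferVP9 slice_param{};
  // ... fill in both structs ...
  if (!wrapper->SubmitBuffers(
          {{VAPictureParameterBufferType, sizeof(pic_param), &pic_param},
           {VASliceParameterBufferType, sizeof(slice_param),
            &slice_param}})) {
    // On failure, any already-created VABufferIDs remain pending until
    // DestroyPendingBuffers() or ExecuteAndDestroyPendingBuffers().
  }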
--- a/media/gpu/vaapi/vp9_vaapi_video_decoder_delegate.cc
+++ b/media/gpu/vaapi/vp9_vaapi_video_decoder_delegate.cc
@@ -108,9 +108,6 @@ bool VP9VaapiVideoDecoderDelegate::SubmitDecode(
   DCHECK((pic_param.profile == 0 && pic_param.bit_depth == 8) ||
          (pic_param.profile == 2 && pic_param.bit_depth == 10));
 
-  if (!vaapi_wrapper_->SubmitBuffer(VAPictureParameterBufferType, &pic_param))
-    return false;
-
   VASliceParameterBufferVP9 slice_param;
   memset(&slice_param, 0, sizeof(slice_param));
   slice_param.slice_data_size = frame_hdr->frame_size;
@@ -141,12 +138,14 @@ bool VP9VaapiVideoDecoderDelegate::SubmitDecode(
     seg_param.chroma_ac_quant_scale = seg.uv_dequant[i][1];
   }
 
-  if (!vaapi_wrapper_->SubmitBuffer(VASliceParameterBufferType, &slice_param))
-    return false;
-
-  if (!vaapi_wrapper_->SubmitBuffer(VASliceDataBufferType,
-                                    frame_hdr->frame_size, frame_hdr->data))
+  if (!vaapi_wrapper_->SubmitBuffers(
+          {{VAPictureParameterBufferType,
+            sizeof(VADecPictureParameterBufferVP9), &pic_param},
+           {VASliceParameterBufferType, sizeof(VASliceParameterBufferVP9),
+            &slice_param},
+           {VASliceDataBufferType, frame_hdr->frame_size, frame_hdr->data}})) {
     return false;
+  }
 
   return vaapi_wrapper_->ExecuteAndDestroyPendingBuffers(
       pic->AsVaapiVP9Picture()->va_surface()->id());