Commit fc1767a1 authored by Daniele Castagna's avatar Daniele Castagna Committed by Commit Bot

vaapi: Improve VAAPI encoding scheduling

VAAPI usage in Chrome has a global lock that is acquired for each
vaapi call.

The assumption was that vaapi calls would not block on the CPU,
but they do. In particular, vaEndPicture seems to take a significant
amount of wall time during encoding.

When multiple encodings are happening, we might end up interleaving
parts of the encodings of different streams and serialize them.
This causes wall time for one encoding to sometimes take into account
wall time of other encodings too.

This CL tries to mitigate that issue in a few different ways:
- In VaapiVideoEncodeAccelerator::ReturnBitstreamBuffer the client is
  notified that the encoding is ready before destroying the surface,
  since destroying the surface reacquires the lock that might be taken
  by another vaapi call in another thread.
- ExecuteAndDestroyPendingBuffers acquires the lock only once, so
  that another vaapi client can block in between execute and destroy.
- DownloadFromVABuffer doesn't unlock and relock during the copy to
  shmem, since the copy takes in the order of .1ms for a VGA stream
  and gives the chances to another VAAPI client to acquire the lock
  once the encoding is done and before we notify the client.

Bug: 1005592
Test:
  on meet.google.com, record average WebRTC encoding times,
  observe it drops by ~10%

Change-Id: I807e281ba501b5d05d53902c8257010725730966
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1819518
Commit-Queue: Daniele Castagna <dcastagna@chromium.org>
Reviewed-by: default avatarHirokazu Honda <hiroh@chromium.org>
Reviewed-by: default avatarMiguel Casas <mcasas@chromium.org>
Cr-Commit-Position: refs/heads/master@{#701010}
parent 6edb3bd2
......@@ -505,7 +505,7 @@ void VaapiVideoEncodeAccelerator::ReturnBitstreamBuffer(
uint8_t* target_data = static_cast<uint8_t*>(buffer->shm->memory());
size_t data_size = 0;
if (!vaapi_wrapper_->DownloadAndDestroyVABuffer(
if (!vaapi_wrapper_->DownloadFromVABuffer(
encode_job->coded_buffer_id(), encode_job->input_surface()->id(),
target_data, buffer->shm->size(), &data_size)) {
NOTIFY_ERROR(kPlatformFailureError, "Failed downloading coded buffer");
......@@ -519,6 +519,8 @@ void VaapiVideoEncodeAccelerator::ReturnBitstreamBuffer(
child_task_runner_->PostTask(
FROM_HERE, base::BindOnce(&Client::BitstreamBufferReady, client_,
buffer->id, encode_job->Metadata(data_size)));
vaapi_wrapper_->DestroyVABuffer(encode_job->coded_buffer_id());
}
void VaapiVideoEncodeAccelerator::Encode(scoped_refptr<VideoFrame> frame,
......
......@@ -1653,8 +1653,12 @@ bool VaapiWrapper::SubmitVAEncMiscParamBuffer(
void VaapiWrapper::DestroyPendingBuffers() {
TRACE_EVENT0("media,gpu", "VaapiWrapper::DestroyPendingBuffers");
base::AutoLock auto_lock(*va_lock_);
TRACE_EVENT0("media,gpu", "VaapiWrapper::DestroyPendingBuffersLocked");
DestroyPendingBuffers_Locked();
}
void VaapiWrapper::DestroyPendingBuffers_Locked() {
TRACE_EVENT0("media,gpu", "VaapiWrapper::DestroyPendingBuffers_Locked");
va_lock_->AssertAcquired();
for (const auto& pending_va_buf : pending_va_bufs_) {
VAStatus va_res = vaDestroyBuffer(va_display_, pending_va_buf);
VA_LOG_ON_ERROR(va_res, "vaDestroyBuffer failed");
......@@ -1670,8 +1674,9 @@ void VaapiWrapper::DestroyPendingBuffers() {
}
bool VaapiWrapper::ExecuteAndDestroyPendingBuffers(VASurfaceID va_surface_id) {
bool result = Execute(va_surface_id);
DestroyPendingBuffers();
base::AutoLock auto_lock(*va_lock_);
bool result = Execute_Locked(va_surface_id);
DestroyPendingBuffers_Locked();
return result;
}
......@@ -1823,8 +1828,11 @@ bool VaapiWrapper::DownloadFromVABuffer(VABufferID buffer_id,
auto* buffer_segment =
reinterpret_cast<VACodedBufferSegment*>(mapping.data());
// memcpy calls should be fast, unlocking and relocking for unmapping might
// cause another thread to acquire the lock and we'd have to wait delaying the
// notification that the encode is done.
{
base::AutoUnlock auto_unlock(*va_lock_);
TRACE_EVENT0("media,gpu", "VaapiWrapper::DownloadFromVABufferCopyEncoded");
*coded_data_size = 0;
while (buffer_segment) {
......@@ -1871,21 +1879,12 @@ bool VaapiWrapper::GetVAEncMaxNumOfRefFrames(VideoCodecProfile profile,
return true;
}
bool VaapiWrapper::DownloadAndDestroyVABuffer(VABufferID buffer_id,
VASurfaceID sync_surface_id,
uint8_t* target_ptr,
size_t target_size,
size_t* coded_data_size) {
bool result = DownloadFromVABuffer(buffer_id, sync_surface_id, target_ptr,
target_size, coded_data_size);
void VaapiWrapper::DestroyVABuffer(VABufferID buffer_id) {
base::AutoLock auto_lock(*va_lock_);
VAStatus va_res = vaDestroyBuffer(va_display_, buffer_id);
VA_LOG_ON_ERROR(va_res, "vaDestroyBuffer failed");
const auto was_found = va_buffers_.erase(buffer_id);
DCHECK(was_found);
return result;
}
void VaapiWrapper::DestroyVABuffers() {
......@@ -2186,7 +2185,12 @@ void VaapiWrapper::DestroySurface(VASurfaceID va_surface_id) {
bool VaapiWrapper::Execute(VASurfaceID va_surface_id) {
TRACE_EVENT0("media,gpu", "VaapiWrapper::Execute");
base::AutoLock auto_lock(*va_lock_);
TRACE_EVENT0("media,gpu", "VaapiWrapper::ExecuteLocked");
return Execute_Locked(va_surface_id);
}
bool VaapiWrapper::Execute_Locked(VASurfaceID va_surface_id) {
TRACE_EVENT0("media,gpu", "VaapiWrapper::Execute_Locked");
va_lock_->AssertAcquired();
DVLOG(4) << "Pending VA bufs to commit: " << pending_va_bufs_.size();
DVLOG(4) << "Pending slice bufs to commit: " << pending_slice_bufs_.size();
......
......@@ -351,13 +351,8 @@ class MEDIA_GPU_EXPORT VaapiWrapper
size_t target_size,
size_t* coded_data_size);
// See DownloadFromVABuffer() for details. After downloading, it deletes
// the VA buffer with |buffer_id|.
bool DownloadAndDestroyVABuffer(VABufferID buffer_id,
VASurfaceID sync_surface_id,
uint8_t* target_ptr,
size_t target_size,
size_t* coded_data_size);
// Deletes the VA buffer identified by |buffer_id|.
void DestroyVABuffer(VABufferID buffer_id);
// Destroy all previously-allocated (and not yet destroyed) buffers.
void DestroyVABuffers();
......@@ -410,6 +405,10 @@ class MEDIA_GPU_EXPORT VaapiWrapper
// if vaapi driver refuses to accept parameter or slice buffers submitted
// by client, or if execution fails in hardware.
bool Execute(VASurfaceID va_surface_id);
bool Execute_Locked(VASurfaceID va_surface_id)
EXCLUSIVE_LOCKS_REQUIRED(va_lock_);
void DestroyPendingBuffers_Locked() EXCLUSIVE_LOCKS_REQUIRED(va_lock_);
// Attempt to set render mode to "render to texture.". Failure is non-fatal.
void TryToSetVADisplayAttributeToLocalGPU();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment