Commit 6d14d6a8 authored by Dale Curtis, committed by Commit Bot

Improve efficiency of GpuVideoDecoder shmem usage.

Previously segments were always pulled from the back of the vector
and only the last value was checked before deciding to create a
new segment.  As a result, a small buffer at the back of the vector
could repeatedly trigger shmem creation.

Instead switch to a first fit algorithm where segments are kept in
sorted order by size. Also switch to choosing the minimum segment
size based on the resolution and clear between flushes.

Since we may be allocating a few more buffers now, the code will
expire unused ones if they haven't been used after 1024 frames.

tl;dr: reductions in memory of ~42% at 360p or below, ~8% at 480p,
~10% at 720p, equivalent at 1080p, ~38% at 2160p. So this should
save us 10s to 100s of KBs on the most common media.

Old results:
360p, dec: 1228217, alloc: 409600 (4), d/a: 2.99858, size: 102400
480p, dec: 3170779, alloc: 536946 (5), d/a: 5.90521, size: 102400
720p, dec: 9402525, alloc: 1191936 (8), d/a: 7.88845, size: 102400
1080p, dec: 16676052, alloc: 2052082 (9), d/a: 8.12641, size: 102400
2160p, dec: 63319373, alloc: 3051283 (18), d/a: 20.7517, size: 102400

New results:
360p, dec: 1228217, alloc: 237568 (6), d/a: 5.16996, size: 32768
480p, dec: 3170779, alloc: 495616 (7), d/a: 6.39765, size: 49152
720p, dec: 9402525, alloc: 1077248 (8), d/a: 8.72828, size: 73728
1080p, dec: 16676052, alloc: 2043904 (9), d/a: 8.15892, size: 98304
2160p, dec: 63319373, alloc: 1888256 (7), d/a: 33.5333, size: 196608

BUG=none
TEST=manual playback of clips with varying resolutions.

Cq-Include-Trybots: master.tryserver.chromium.android:android_optional_gpu_tests_rel;master.tryserver.chromium.linux:linux_optional_gpu_tests_rel;master.tryserver.chromium.mac:mac_optional_gpu_tests_rel;master.tryserver.chromium.win:win_optional_gpu_tests_rel

Change-Id: I6df3dd41320dcc70dfd23224767b107806b0942e
Reviewed-on: https://chromium-review.googlesource.com/565007
Commit-Queue: Dale Curtis <dalecurtis@chromium.org>
Reviewed-by: Dan Sanders <sandersd@chromium.org>
Cr-Commit-Position: refs/heads/master@{#486006}
parent ba4ed7d9
......@@ -40,10 +40,6 @@
namespace media {
namespace {
// Size of shared-memory segments we allocate. Since we reuse them we let them
// be on the beefy side.
static const size_t kSharedMemorySegmentBytes = 100 << 10;
#if defined(OS_ANDROID) && BUILDFLAG(USE_PROPRIETARY_CODECS)
// Extract the SPS and PPS lists from |extra_data|. Each SPS and PPS is prefixed
// with 0x0001, the Annex B framing bytes. The out parameters are not modified
......@@ -82,6 +78,10 @@ const char GpuVideoDecoder::kDecoderName[] = "GpuVideoDecoder";
// resources.
enum { kMaxInFlightDecodes = 4 };
// Threshold, in bitstream buffer ids, that drives garbage collection of pooled
// shared memory segments in PutSharedMemory(). It is used two ways there: a GC
// pass runs once |next_bitstream_buffer_id_| has advanced by more than this
// amount since the previous pass, and a pass erases every pooled segment whose
// recorded bitstream buffer id is more than this amount behind
// |next_bitstream_buffer_id_|. Value chosen arbitrarily.
enum { kBufferCountBeforeGC = 1024 };
struct GpuVideoDecoder::PendingDecoderBuffer {
PendingDecoderBuffer(std::unique_ptr<base::SharedMemory> s,
const scoped_refptr<DecoderBuffer>& b,
......@@ -120,6 +120,8 @@ GpuVideoDecoder::GpuVideoDecoder(
supports_deferred_initialization_(false),
requires_texture_copy_(false),
cdm_id_(CdmContext::kInvalidCdmId),
min_shared_memory_segment_size_(0),
bitstream_buffer_id_of_last_gc_(0),
weak_factory_(this) {
DCHECK(factories_);
}
......@@ -244,6 +246,24 @@ void GpuVideoDecoder::Initialize(const VideoDecoderConfig& config,
VideoDecodeAccelerator::Capabilities::SUPPORTS_DEFERRED_INITIALIZATION);
output_cb_ = output_cb;
// Attempt to choose a reasonable size for the shared memory segments based on
// the size of video. These values are chosen based on experiments with common
// videos from the web. Too small and you'll end up creating too many
// segments; too large and you end up wasting significant amounts of memory.
const int height = config.coded_size().height();
if (height >= 4000) // ~4320p
min_shared_memory_segment_size_ = 384 * 1024;
else if (height >= 2000) // ~2160p
min_shared_memory_segment_size_ = 192 * 1024;
else if (height >= 1000) // ~1080p
min_shared_memory_segment_size_ = 96 * 1024;
else if (height >= 700) // ~720p
min_shared_memory_segment_size_ = 72 * 1024;
else if (height >= 400) // ~480p
min_shared_memory_segment_size_ = 48 * 1024;
else // ~360p or less
min_shared_memory_segment_size_ = 32 * 1024;
if (config.is_encrypted() && !supports_deferred_initialization_) {
DVLOG(1) << __func__
<< " Encrypted stream requires deferred initialialization.";
......@@ -745,21 +765,40 @@ void GpuVideoDecoder::ReusePictureBuffer(int64_t picture_buffer_id) {
// Returns a shared-memory segment whose mapped size is at least |min_size|.
// A pooled segment from |available_shm_segments_| is reused when one is large
// enough; otherwise a new segment is requested from |factories_|.
std::unique_ptr<base::SharedMemory> GpuVideoDecoder::GetSharedMemory(
size_t min_size) {
DCheckGpuVideoAcceleratorFactoriesTaskRunnerIsCurrent();
if (available_shm_segments_.empty() ||
available_shm_segments_.back()->mapped_size() < min_size) {
size_t size_to_allocate = std::max(min_size, kSharedMemorySegmentBytes);
// CreateSharedMemory() can return NULL during Shutdown.
return factories_->CreateSharedMemory(size_to_allocate);
// First fit: binary-search the pool for the first entry whose mapped size is
// not less than |min_size|. This relies on the pool being ordered by mapped
// size (see ShMemEntrySortedBySize).
auto it = std::lower_bound(available_shm_segments_.begin(),
available_shm_segments_.end(), min_size,
[](const ShMemEntry& entry, const size_t size) {
return entry.first->mapped_size() < size;
});
if (it != available_shm_segments_.end()) {
// Take ownership of the segment before removing its entry from the pool.
auto ret = std::move(it->first);
available_shm_segments_.erase(it);
return ret;
}
auto ret = std::move(available_shm_segments_.back());
available_shm_segments_.pop_back();
return ret;
// No pooled segment is large enough: create one that is at least
// |min_shared_memory_segment_size_| bytes so it can be reused for later,
// larger requests. CreateSharedMemory() can return NULL during Shutdown.
return factories_->CreateSharedMemory(
std::max(min_shared_memory_segment_size_, min_size));
}
// Returns |shared_memory| to the pool, tagged with |last_bitstream_buffer_id|,
// and periodically erases pooled segments whose recorded id has fallen too far
// behind |next_bitstream_buffer_id_|.
void GpuVideoDecoder::PutSharedMemory(
std::unique_ptr<base::SharedMemory> shared_memory) {
std::unique_ptr<base::SharedMemory> shared_memory,
int32_t last_bitstream_buffer_id) {
DCheckGpuVideoAcceleratorFactoriesTaskRunnerIsCurrent();
available_shm_segments_.push_back(std::move(shared_memory));
// Pool the segment together with the id of the bitstream buffer it was last
// assigned to; the id is what the GC below uses to age entries out.
available_shm_segments_.emplace(std::move(shared_memory),
last_bitstream_buffer_id);
// Run a GC pass when |next_bitstream_buffer_id_| is below its value at the
// previous pass (i.e. it has rolled over), or when it has advanced by more
// than kBufferCountBeforeGC since that pass.
if (next_bitstream_buffer_id_ < bitstream_buffer_id_of_last_gc_ ||
next_bitstream_buffer_id_ - bitstream_buffer_id_of_last_gc_ >
kBufferCountBeforeGC) {
base::EraseIf(available_shm_segments_, [this](const ShMemEntry& entry) {
// Check for overflow rollover...
// The entry was tagged before the rollover; erase it only once the counter
// has itself moved past kBufferCountBeforeGC again.
if (next_bitstream_buffer_id_ < entry.second)
return next_bitstream_buffer_id_ > kBufferCountBeforeGC;
// Normal case: erase entries tagged more than kBufferCountBeforeGC ids ago.
return next_bitstream_buffer_id_ - entry.second > kBufferCountBeforeGC;
});
// Remember where this pass happened so the next one is scheduled relative
// to it.
bitstream_buffer_id_of_last_gc_ = next_bitstream_buffer_id_;
}
}
void GpuVideoDecoder::NotifyEndOfBitstreamBuffer(int32_t id) {
......@@ -774,7 +813,7 @@ void GpuVideoDecoder::NotifyEndOfBitstreamBuffer(int32_t id) {
return;
}
PutSharedMemory(std::move(it->second.shared_memory));
PutSharedMemory(std::move(it->second.shared_memory), id);
it->second.done_cb.Run(state_ == kError ? DecodeStatus::DECODE_ERROR
: DecodeStatus::OK);
bitstream_buffers_in_decoder_.erase(it);
......@@ -811,6 +850,10 @@ void GpuVideoDecoder::NotifyFlushDone() {
DCHECK_EQ(state_, kDrainingDecoder);
state_ = kDecoderDrained;
base::ResetAndReturn(&eos_decode_cb_).Run(DecodeStatus::OK);
// Assume flush is for a config change, so drop shared memory segments in
// anticipation of a resize occurring.
available_shm_segments_.clear();
}
void GpuVideoDecoder::NotifyResetDone() {
......
......@@ -8,12 +8,14 @@
#include <stddef.h>
#include <stdint.h>
#include <deque>
#include <list>
#include <map>
#include <set>
#include <utility>
#include <vector>
#include "base/containers/flat_set.h"
#include "base/macros.h"
#include "base/memory/weak_ptr.h"
#include "gpu/command_buffer/common/sync_token.h"
......@@ -116,11 +118,12 @@ class MEDIA_EXPORT GpuVideoDecoder
void DestroyVDA();
// Request a shared-memory segment of at least |min_size| bytes. Will
// allocate as necessary.
// allocate as necessary. May return nullptr during Shutdown.
std::unique_ptr<base::SharedMemory> GetSharedMemory(size_t min_size);
// Return a shared-memory segment to the available pool.
void PutSharedMemory(std::unique_ptr<base::SharedMemory> shm_buffer);
void PutSharedMemory(std::unique_ptr<base::SharedMemory> shm_buffer,
int32_t last_bitstream_buffer_id);
// Destroy all PictureBuffers in |buffers|, and delete their textures.
void DestroyPictureBuffers(PictureBufferMap* buffers);
......@@ -176,10 +179,20 @@ class MEDIA_EXPORT GpuVideoDecoder
VideoDecoderConfig config_;
// Shared-memory buffer pool. Since allocating SHM segments requires a
// round-trip to the browser process, we keep allocation out of the
// steady-state of the decoder.
std::vector<std::unique_ptr<base::SharedMemory>> available_shm_segments_;
// Shared-memory buffer pool. Since allocating SHM segments requires a round-
// trip to the browser process, we try to keep allocation out of the steady-
// state of the decoder.
//
// The second value in the ShMemEntry is the last bitstream buffer id assigned
// to that segment; it's used to erase segments which are no longer active.
using ShMemEntry = std::pair<std::unique_ptr<base::SharedMemory>, int32_t>;
// Strict weak ordering for entries in |available_shm_segments_|.
//
// Entries are ordered primarily by ascending mapped size, which is what lets
// GetSharedMemory() binary-search for the smallest segment that fits a request.
//
// Ties are broken by the address of the owned segment. Without the tie-break,
// two distinct segments with the same mapped size would compare equivalent; a
// unique-key set would then refuse to store the second one and the returned
// segment would be destroyed instead of pooled. Same-sized segments are the
// common case because new segments are created at a shared minimum size.
class ShMemEntrySortedBySize {
 public:
  // Returns true if |lhs| must be ordered before |rhs|.
  bool operator()(const ShMemEntry& lhs, const ShMemEntry& rhs) const {
    const auto lhs_size = lhs.first->mapped_size();
    const auto rhs_size = rhs.first->mapped_size();
    if (lhs_size != rhs_size)
      return lhs_size < rhs_size;
    // std::unique_ptr's operator< compares the stored pointers via std::less,
    // giving a well-defined total order between distinct segments.
    return lhs.first < rhs.first;
  }
};
base::flat_set<ShMemEntry, ShMemEntrySortedBySize> available_shm_segments_;
// Placeholder sync token that was created and validated after the most
// recent picture buffers were created.
......@@ -235,6 +248,16 @@ class MEDIA_EXPORT GpuVideoDecoder
// encrypted content.
int cdm_id_;
// Minimum size for shared memory segments. Ideally chosen to optimize the
// number of segments and total size of allocations over the course of a
// playback. See Initialize() for more details.
size_t min_shared_memory_segment_size_;
// |next_bitstream_buffer_id_| at the time we last performed a GC of no longer
// used ShMemEntry objects in |available_shm_segments_|. Updated whenever
// PutSharedMemory() performs a GC.
int32_t bitstream_buffer_id_of_last_gc_;
// Bound to factories_->GetMessageLoop().
// NOTE: Weak pointers must be invalidated before all other member variables.
base::WeakPtrFactory<GpuVideoDecoder> weak_factory_;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment