Commit c8d0d447 authored by Adrienne Walker, committed by Commit Bot

Use transfer buffer for small transfer cache entries

OOP-R in RasterImplementation currently allocates the entire free space
available in the transfer buffer so that it doesn't have to measure
first and then serialize, and can instead optimistically serialize into
that space.  This means that other transfer buffer consumers can't use
it and are forced to use mapped memory instead.  That turns out not to
be very efficient for large numbers of small allocations.

To sidestep this problem, small transfer cache entries are serialized
on the heap when they are encountered.  When raster is complete, the
transfer buffer allocation is shrunk down to the size actually written,
and these small transfer cache entries are appended after it.  The
commands for these entries are then submitted before the raster command.

For example, for three transfer cache entries (A B C) and one raster
command (R), the transfer buffer's ring buffer and the command buffer
will look like this:

    transfer buffer ring buffer: R A B C
    command buffer: A B C R
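
In code, that post-raster ordering is handled by the paint op
serializer; the following is a condensed sketch of
PaintOpSerializer::SendSerializedData from the diff below (error
handling and image unlocking omitted):

    void PaintOpSerializer::SendSerializedData() {
      font_manager_->Serialize();
      // Shrink the raster allocation (R) in the transfer buffer down to
      // the bytes actually written.
      ri_->FinalizeRasterCHROMIUM(written_bytes_);
      // Copy the heap-serialized small entries (A B C) into the transfer
      // buffer after R and issue their creation commands.
      transfer_cache_helper_->SubmitDeferredEntries();
      // Issue the raster command itself, after its dependencies.
      ri_->UnmapRasterCHROMIUM(written_bytes_);
      // Unlock all transfer cache entries used (immediate and deferred).
      transfer_cache_helper_->FlushEntries();
      written_bytes_ = 0;
    }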

This patch also loosens the restriction on gpu::RingBuffer that there
can be only one in-use block at any time.  This leads to potential
exhaustion issues, because the ring buffer won't reallocate while there
are in-use blocks.  To avoid this, the optimization is only used when
the entry fits in the ring buffer without waiting.
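
That guard looks roughly like the following (condensed from
RasterImplementation::MapTransferCacheEntry in the diff below):

    // Only use the transfer buffer for small entries, only outside of
    // raster, and only if the entry fits without waiting on the ring
    // buffer; otherwise fall back to mapped memory.
    if (inside_raster_ ||
        serialized_size > kMaxTransferCacheEntrySizeForTransferBuffer ||
        transfer_buffer_->GetFreeSize() < serialized_size) {
      return transfer_cache_.MapEntry(mapped_memory_.get(), serialized_size);
    }
    return transfer_cache_.MapTransferBufferEntry(transfer_buffer_,
                                                  serialized_size);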

An alternative to this patch would have been to add yet another
transfer buffer used only for the transfer cache, or to rewrite OOP-R
serialization, but both of those are more invasive solutions.

On OSX, on the rendering.desktop telemetry benchmark, on the story
web_animation_value_type_path, this patch produces the following results:

gpu-r:
  raster cpu time: 2.011ms
  gpu cpu time: 12.891ms
  total time: 14.902ms

oop-r:
  raster cpu time: 7.314ms <- the bug
  gpu cpu time: 3.804ms
  total time: 11.118ms

oop-r + this patch:
  raster cpu time: 0.812ms
  gpu cpu time: 3.901ms
  total time: 4.713ms

Bug: 804380, 877168
Cq-Include-Trybots: luci.chromium.try:android_optional_gpu_tests_rel;luci.chromium.try:linux_optional_gpu_tests_rel;luci.chromium.try:mac_optional_gpu_tests_rel;luci.chromium.try:win_optional_gpu_tests_rel;master.tryserver.blink:linux_trusty_blink_rel
Change-Id: I586cbca2acb13b8de7d3490c5eb5d6b415f6eda5
Reviewed-on: https://chromium-review.googlesource.com/c/1262955
Commit-Queue: enne <enne@chromium.org>
Reviewed-by: Antoine Labour <piman@chromium.org>
Cr-Commit-Position: refs/heads/master@{#597654}
parent 1c873152
......@@ -24,6 +24,8 @@ class CC_PAINT_EXPORT TransferCacheSerializeHelper {
void AssertLocked(TransferCacheEntryType type, uint32_t id);
virtual void SubmitDeferredEntries() {}
protected:
using EntryKey = std::pair<TransferCacheEntryType, uint32_t>;
......
......@@ -177,7 +177,9 @@ TEST_F(TransferCacheTest, CountEviction) {
EXPECT_EQ(service_cache->cache_size_for_testing(), 20u);
}
TEST_F(TransferCacheTest, RawMemoryTransfer) {
// This tests a size that is small enough that the transfer buffer is used
// inside of RasterImplementation::MapTransferCacheEntry.
TEST_F(TransferCacheTest, RawMemoryTransferSmall) {
auto* service_cache = ServiceTransferCache();
// Create an entry with some initialized data.
......@@ -202,6 +204,33 @@ TEST_F(TransferCacheTest, RawMemoryTransfer) {
EXPECT_EQ(data, service_data);
}
// This tests a size that is large enough that mapped memory is used inside
// of RasterImplementation::MapTransferCacheEntry.
TEST_F(TransferCacheTest, RawMemoryTransferLarge) {
auto* service_cache = ServiceTransferCache();
// Create an entry with some initialized data.
std::vector<uint8_t> data;
data.resize(1500);
for (size_t i = 0; i < data.size(); ++i) {
data[i] = i;
}
// Add the entry to the transfer cache
ClientRawMemoryTransferCacheEntry client_entry(data);
CreateEntry(client_entry);
ri()->Finish();
// Validate service-side data matches.
ServiceTransferCacheEntry* service_entry =
service_cache->GetEntry(gpu::ServiceTransferCache::EntryKey(
decoder_id(), client_entry.Type(), client_entry.Id()));
EXPECT_EQ(service_entry->Type(), client_entry.Type());
const std::vector<uint8_t> service_data =
static_cast<ServiceRawMemoryTransferCacheEntry*>(service_entry)->data();
EXPECT_EQ(data, service_data);
}
TEST_F(TransferCacheTest, ImageMemoryTransfer) {
// TODO(ericrk): This test doesn't work. crbug.com/859619
return;
......
......@@ -13,17 +13,31 @@ ClientTransferCache::~ClientTransferCache() = default;
void* ClientTransferCache::MapEntry(MappedMemoryManager* mapped_memory,
size_t size) {
DCHECK(!mapped_ptr_);
DCHECK(!transfer_buffer_ptr_);
mapped_ptr_.emplace(size, client_->cmd_buffer_helper(), mapped_memory);
if (!mapped_ptr_->valid()) {
mapped_ptr_ = base::nullopt;
return nullptr;
} else {
return mapped_ptr_->address();
}
return mapped_ptr_->address();
}
void* ClientTransferCache::MapTransferBufferEntry(
TransferBufferInterface* transfer_buffer,
size_t size) {
DCHECK(!mapped_ptr_);
DCHECK(!transfer_buffer_ptr_);
transfer_buffer_ptr_.emplace(size, client_->cmd_buffer_helper(),
transfer_buffer);
if (!transfer_buffer_ptr_->valid()) {
transfer_buffer_ptr_ = base::nullopt;
return nullptr;
}
return transfer_buffer_ptr_->address();
}
void ClientTransferCache::UnmapAndCreateEntry(uint32_t type, uint32_t id) {
DCHECK(mapped_ptr_);
DCHECK(mapped_ptr_ || transfer_buffer_ptr_);
EntryKey key(type, id);
base::AutoLock hold(lock_);
......@@ -42,10 +56,20 @@ void ClientTransferCache::UnmapAndCreateEntry(uint32_t type, uint32_t id) {
DCHECK(FindDiscardableHandleId(key).is_null());
discardable_handle_id_map_.emplace(key, discardable_handle_id);
client_->IssueCreateTransferCacheEntry(
type, id, handle.shm_id(), handle.byte_offset(), mapped_ptr_->shm_id(),
mapped_ptr_->offset(), mapped_ptr_->size());
mapped_ptr_ = base::nullopt;
if (mapped_ptr_) {
DCHECK(!transfer_buffer_ptr_);
client_->IssueCreateTransferCacheEntry(
type, id, handle.shm_id(), handle.byte_offset(), mapped_ptr_->shm_id(),
mapped_ptr_->offset(), mapped_ptr_->size());
mapped_ptr_ = base::nullopt;
} else {
DCHECK(!mapped_ptr_);
client_->IssueCreateTransferCacheEntry(
type, id, handle.shm_id(), handle.byte_offset(),
transfer_buffer_ptr_->shm_id(), transfer_buffer_ptr_->offset(),
transfer_buffer_ptr_->size());
transfer_buffer_ptr_ = base::nullopt;
}
}
bool ClientTransferCache::LockEntry(uint32_t type, uint32_t id) {
......
......@@ -13,6 +13,7 @@
#include "gpu/command_buffer/client/gles2_impl_export.h"
#include "gpu/command_buffer/client/gles2_interface.h"
#include "gpu/command_buffer/client/mapped_memory.h"
#include "gpu/command_buffer/client/transfer_buffer.h"
namespace gpu {
class MappedMemoryManager;
......@@ -59,7 +60,10 @@ class GLES2_IMPL_EXPORT ClientTransferCache {
explicit ClientTransferCache(Client* client);
~ClientTransferCache();
// Map (of either type) must always be followed by an Unmap.
void* MapEntry(MappedMemoryManager* mapped_memory, size_t size);
void* MapTransferBufferEntry(TransferBufferInterface* transfer_buffer,
size_t size);
void UnmapAndCreateEntry(uint32_t type, uint32_t id);
bool LockEntry(uint32_t type, uint32_t id);
void UnlockEntries(const std::vector<std::pair<uint32_t, uint32_t>>& entries);
......@@ -72,6 +76,7 @@ class GLES2_IMPL_EXPORT ClientTransferCache {
Client* const client_; // not owned --- client_ outlives this
base::Optional<ScopedMappedMemoryPtr> mapped_ptr_;
base::Optional<ScopedTransferBufferPtr> transfer_buffer_ptr_;
// Access to other members must always be done with |lock_| held.
base::Lock lock_;
......
......@@ -86,6 +86,8 @@ namespace raster {
namespace {
const size_t kMaxTransferCacheEntrySizeForTransferBuffer = 1024;
// Helper to copy data to the GPU service over the transfer cache.
class TransferCacheSerializeHelperImpl
: public cc::TransferCacheSerializeHelper {
......@@ -94,6 +96,15 @@ class TransferCacheSerializeHelperImpl
: support_(support) {}
~TransferCacheSerializeHelperImpl() final = default;
void SubmitDeferredEntries() final {
for (auto& entry : deferred_entries_) {
void* data = support_->MapTransferCacheEntry(entry.size);
memcpy(data, entry.data.get(), entry.size);
support_->UnmapAndCreateTransferCacheEntry(entry.type, entry.id);
}
deferred_entries_.clear();
}
private:
bool LockEntryInternal(const EntryKey& key) final {
return support_->ThreadsafeLockTransferCacheEntry(
......@@ -102,6 +113,14 @@ class TransferCacheSerializeHelperImpl
void CreateEntryInternal(const cc::ClientTransferCacheEntry& entry) final {
size_t size = entry.SerializedSize();
// As an optimization, don't defer entries that are too large, as we will
// always choose mapped memory for them in MapTransferCacheEntry below.
// If we defer but then use mapped memory, it's an unneeded copy.
if (size <= kMaxTransferCacheEntrySizeForTransferBuffer) {
DeferEntry(entry);
return;
}
void* data = support_->MapTransferCacheEntry(size);
if (!data)
return;
......@@ -113,6 +132,8 @@ class TransferCacheSerializeHelperImpl
}
void FlushEntriesInternal(std::set<EntryKey> entries) final {
DCHECK(deferred_entries_.empty());
std::vector<std::pair<uint32_t, uint32_t>> transformed;
transformed.reserve(entries.size());
for (const auto& e : entries)
......@@ -120,8 +141,27 @@ class TransferCacheSerializeHelperImpl
support_->UnlockTransferCacheEntries(transformed);
}
void DeferEntry(const cc::ClientTransferCacheEntry& entry) {
// Because the lifetime of the transfer cache entry is transient, we need
// to serialize on the heap instead of holding onto the entry.
size_t size = entry.SerializedSize();
std::unique_ptr<uint8_t[]> mem(new uint8_t[size]);
bool succeeded = entry.Serialize(base::make_span(mem.get(), size));
DCHECK(succeeded);
deferred_entries_.push_back(
{size, entry.UnsafeType(), entry.Id(), std::move(mem)});
}
ContextSupport* const support_;
struct DeferredEntry {
size_t size;
uint32_t type;
uint32_t id;
std::unique_ptr<uint8_t[]> data;
};
std::vector<DeferredEntry> deferred_entries_;
DISALLOW_COPY_AND_ASSIGN(TransferCacheSerializeHelperImpl);
};
......@@ -173,11 +213,24 @@ class PaintOpSerializer {
// Serialize fonts before sending raster commands.
font_manager_->Serialize();
// Shrink the transfer buffer down to what was written during raster.
ri_->FinalizeRasterCHROMIUM(written_bytes_);
// Create and submit any transfer cache entries that were waiting for raster
// to finish before being written into the transfer buffer.
transfer_cache_helper_->SubmitDeferredEntries();
// Send the raster command itself now that the commands for its
// dependencies have been sent.
ri_->UnmapRasterCHROMIUM(written_bytes_);
// Now that we've issued the RasterCHROMIUM referencing the stashed
// images, Reset the |stashing_image_provider_|, causing us to issue
// unlock commands for these images.
stashing_image_provider_->Reset();
// Unlock all the transfer cache entries used (both immediate and deferred).
transfer_cache_helper_->FlushEntries();
written_bytes_ = 0;
}
......@@ -402,7 +455,21 @@ bool RasterImplementation::ThreadsafeDiscardableTextureIsDeletedForTracing(
}
void* RasterImplementation::MapTransferCacheEntry(size_t serialized_size) {
return transfer_cache_.MapEntry(mapped_memory_.get(), serialized_size);
// Putting small entries in the transfer buffer is an optimization. To avoid
// the case of one client exhausting the entire transfer buffer with transfer
// cache entries, only use this path if the entry fits without waiting, and
// fall back to mapped memory otherwise. Additionally, if we are inside
// raster, then we must use mapped memory, because raster requests the entire
// free size (GetTransferBufferFreeSize) as its initial allocation and shrinks
// it later during FinalizeRasterCHROMIUM.
if (inside_raster_ ||
serialized_size > kMaxTransferCacheEntrySizeForTransferBuffer ||
transfer_buffer_->GetFreeSize() < serialized_size) {
return transfer_cache_.MapEntry(mapped_memory_.get(), serialized_size);
}
return transfer_cache_.MapTransferBufferEntry(transfer_buffer_,
serialized_size);
}
void RasterImplementation::UnmapAndCreateTransferCacheEntry(uint32_t type,
......@@ -1000,6 +1067,8 @@ void* RasterImplementation::MapRasterCHROMIUM(GLsizeiptr size) {
raster_mapped_buffer_ = base::nullopt;
return nullptr;
}
inside_raster_ = true;
return raster_mapped_buffer_->address();
}
......@@ -1028,10 +1097,12 @@ void* RasterImplementation::MapFontBuffer(size_t size) {
return font_mapped_buffer_->address();
}
void RasterImplementation::UnmapRasterCHROMIUM(GLsizeiptr written_size) {
void RasterImplementation::FinalizeRasterCHROMIUM(GLsizeiptr written_size) {
if (written_size < 0) {
SetGLError(GL_INVALID_VALUE, "glUnmapRasterCHROMIUM",
"negative written_size");
raster_mapped_buffer_->Discard();
raster_mapped_buffer_ = base::nullopt;
return;
}
if (!raster_mapped_buffer_) {
......@@ -1045,6 +1116,15 @@ void RasterImplementation::UnmapRasterCHROMIUM(GLsizeiptr written_size) {
return;
}
raster_mapped_buffer_->Shrink(written_size);
inside_raster_ = false;
}
void RasterImplementation::UnmapRasterCHROMIUM(GLsizeiptr written_size) {
if (!raster_mapped_buffer_)
return;
DCHECK_EQ(static_cast<uint32_t>(raster_mapped_buffer_->size()),
static_cast<uint32_t>(written_size))
<< "Call FinalizeRasterCHROMIUM first";
GLuint font_shm_id = 0u;
GLuint font_shm_offset = 0u;
......@@ -1143,6 +1223,9 @@ void RasterImplementation::BeginRasterCHROMIUM(
transfer_cache_serialize_helper.AssertLocked(
cc::TransferCacheEntryType::kColorSpace,
raster_color_space.color_space_id);
// Creating this entry could have been deferred, so submit any deferred
// entries so that BeginRasterCHROMIUM can depend on it.
transfer_cache_serialize_helper.SubmitDeferredEntries();
helper_->BeginRasterCHROMIUMImmediate(
sk_color, msaa_sample_count, can_use_lcd_text, color_type,
......@@ -1173,6 +1256,8 @@ void RasterImplementation::RasterCHROMIUM(const cc::DisplayItemList* list,
if (offsets.empty())
return;
DCHECK(!inside_raster_);
// TODO(enne): Tune these numbers
// TODO(enne): Convert these types here and in transfer buffer to be size_t.
static constexpr unsigned int kMinAlloc = 16 * 1024;
......@@ -1208,9 +1293,12 @@ void RasterImplementation::RasterCHROMIUM(const cc::DisplayItemList* list,
capabilities().context_supports_distance_field_text,
capabilities().max_texture_size,
capabilities().glyph_cache_max_texture_bytes);
DCHECK(inside_raster_);
serializer.Serialize(&list->paint_op_buffer_, &offsets, preamble);
// TODO(piman): raise error if !serializer.valid()?
op_serializer.SendSerializedData();
DCHECK(!inside_raster_);
}
void RasterImplementation::EndRasterCHROMIUM() {
......
......@@ -164,6 +164,7 @@ class RASTER_EXPORT RasterImplementation : public RasterInterface,
GLuint64* params);
void* MapRasterCHROMIUM(GLsizeiptr size);
void FinalizeRasterCHROMIUM(GLsizeiptr written_size);
void UnmapRasterCHROMIUM(GLsizeiptr written_size);
// ClientFontManager::Client implementation.
......@@ -295,6 +296,8 @@ class RASTER_EXPORT RasterImplementation : public RasterInterface,
mutable base::Lock lost_lock_;
bool lost_;
bool inside_raster_ = false;
struct RasterProperties {
RasterProperties(SkColor background_color,
bool can_use_lcd_text,
......
......@@ -56,8 +56,6 @@ void RingBuffer::FreeOldestBlock() {
void* RingBuffer::Alloc(unsigned int size) {
DCHECK_LE(size, size_) << "attempt to allocate more than maximum memory";
DCHECK(blocks_.empty() || blocks_.back().state != IN_USE)
<< "Attempt to alloc another block before freeing the previous.";
// Similarly to malloc, an allocation of 0 allocates at least 1 byte, to
// return different pointers every time.
if (size == 0) size = 1;
......
......@@ -52,6 +52,8 @@ class GPU_EXPORT RingBuffer {
// Frees a block of memory, pending the passage of a token. That memory won't
// be re-allocated until the token has passed through the command stream.
// If a block is freed out of order, that hole will be counted as used
// in the Get*FreeSize* functions below.
//
// Parameters:
// pointer: the pointer to the memory block to free.
......
......@@ -628,4 +628,79 @@ TEST_F(TransferBufferExpandContractTest, Shrink) {
transfer_buffer_->FreePendingToken(ptr, 1);
}
TEST_F(TransferBufferTest, MultipleAllocsAndFrees) {
// An arbitrary size, but one that is aligned so that no padding is needed.
constexpr size_t kArbitrarySize = 16;
Initialize();
size_t original_free_size = transfer_buffer_->GetFreeSize();
EXPECT_EQ(transfer_buffer_->GetSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFragmentedFreeSize(), original_free_size);
void* ptr1 = transfer_buffer_->Alloc(kArbitrarySize);
EXPECT_NE(ptr1, nullptr);
EXPECT_EQ(transfer_buffer_->GetSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFreeSize(),
original_free_size - kArbitrarySize);
EXPECT_EQ(transfer_buffer_->GetFragmentedFreeSize(),
original_free_size - kArbitrarySize);
void* ptr2 = transfer_buffer_->Alloc(kArbitrarySize);
EXPECT_NE(ptr2, nullptr);
EXPECT_EQ(transfer_buffer_->GetSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFreeSize(),
original_free_size - kArbitrarySize * 2);
EXPECT_EQ(transfer_buffer_->GetFragmentedFreeSize(),
original_free_size - kArbitrarySize * 2);
void* ptr3 = transfer_buffer_->Alloc(kArbitrarySize);
EXPECT_NE(ptr3, nullptr);
EXPECT_EQ(transfer_buffer_->GetSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFreeSize(),
original_free_size - kArbitrarySize * 3);
EXPECT_EQ(transfer_buffer_->GetFragmentedFreeSize(),
original_free_size - kArbitrarySize * 3);
// Generate tokens in order, but submit out of order.
auto token1 = helper_->InsertToken();
auto token2 = helper_->InsertToken();
auto token3 = helper_->InsertToken();
auto token4 = helper_->InsertToken();
// Freeing the final block here is not perceivable because it's a hole.
transfer_buffer_->FreePendingToken(ptr3, token3);
EXPECT_EQ(transfer_buffer_->GetSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFreeSize(),
original_free_size - kArbitrarySize * 3);
EXPECT_EQ(transfer_buffer_->GetFragmentedFreeSize(),
original_free_size - kArbitrarySize * 3);
// Freeing the first block here leaves the second block plus a hole after it,
// so two blocks are still perceived as in use. The free size (no waiting) has
// not changed because the free_offset_ has not moved, but the fragmented free
// size gets bigger because in_use_offset_ has moved past the first block.
transfer_buffer_->FreePendingToken(ptr1, token1);
EXPECT_EQ(transfer_buffer_->GetSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFreeSize(),
original_free_size - kArbitrarySize * 3);
EXPECT_EQ(transfer_buffer_->GetFragmentedFreeSize(),
original_free_size - kArbitrarySize * 2);
// Allocate a 4th block. This leaves the state as: Freed Used Freed Used
void* ptr4 = transfer_buffer_->Alloc(kArbitrarySize);
EXPECT_EQ(transfer_buffer_->GetSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFreeSize(),
original_free_size - kArbitrarySize * 4);
EXPECT_EQ(transfer_buffer_->GetFragmentedFreeSize(),
original_free_size - kArbitrarySize * 3);
// Freeing the second and fourth blocks makes everything free, so we are back
// to the original size.
transfer_buffer_->FreePendingToken(ptr4, token4);
transfer_buffer_->FreePendingToken(ptr2, token2);
EXPECT_EQ(transfer_buffer_->GetSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFreeSize(), original_free_size);
EXPECT_EQ(transfer_buffer_->GetFragmentedFreeSize(), original_free_size);
}
} // namespace gpu