RELAND2: Vaapi decode: split |decoder_|s GetRequiredNumOfPictures()

This CL is a smart-rebase-and-fix of the CLs below: it introduces a new parameter |use_reduced_number_of_allocations_| to allow for the new working mode described below and to temporarily circumvent the GtsExoPlayerTestCases failures (b/121169667 and b/121003733); this new flag is false when |output_mode_| is IMPORT, so all ARC++ cases should work bc left untouched. OTOH a mem savings overview can be found at https://goo.gl/3PaMiA. Original CL description ----------------------------------------------- Vaapi decode: split |decoder_|s GetRequiredNumOfPictures() This CL reduces the amount of PictureBuffers requested to be allocated by the |client_| when we are not |decode_using_client_picture_buffers_|. Instead, it "splits" the requested allocations into - the actual needed PictureBuffers (A) - the codec's requested reference frames (B) (a new method GetNumReferenceFrames() is added to AcceleratedVideoDecoder for this). This splitting saves a lot of memory, since we allocate A+B buffers instead of 2*(A+B). (B is 5 and A is 4-VP8, 4-12 H264/VP9) Test: crosvideo changing resolutions for each codec, v_d_a_unittest on nocturne (KBL) and caroline (SKL). Bug: 912295 Reviewed-on: https://chromium-review.googlesource.com/c/1363807Reviewed-by: Hirokazu Honda <hiroh@chromium.org> Commit-Queue: Miguel Casas <mcasas@chromium.org> Cr-Original-Original-Commit-Position: refs/heads/master@{#615571} Reviewed-on: https://chromium-review.googlesource.com/c/1379274 Cr-Original-Commit-Position: refs/heads/master@{#617366} Change-Id: Ibf9a1455f8df4d52b77aee8e01f15c02878947ae Reviewed-on: https://chromium-review.googlesource.com/c/1387391 Cr-Commit-Position: refs/heads/master@{#620025}

RELAND2: Vaapi decode: split |decoder_|s GetRequiredNumOfPictures()
This CL is a smart-rebase-and-fix of the CLs below: it introduces a new parameter |use_reduced_number_of_allocations_| to allow for the new working mode described below and to temporarily circumvent the GtsExoPlayerTestCases failures (b/121169667 and b/121003733); this new flag is false when |output_mode_| is IMPORT, so all ARC++ cases should work bc left untouched. OTOH a mem savings overview can be found at https://goo.gl/3PaMiA. Original CL description ----------------------------------------------- Vaapi decode: split |decoder_|s GetRequiredNumOfPictures() This CL reduces the amount of PictureBuffers requested to be allocated by the |client_| when we are not |decode_using_client_picture_buffers_|. Instead, it "splits" the requested allocations into - the actual needed PictureBuffers (A) - the codec's requested reference frames (B) (a new method GetNumReferenceFrames() is added to AcceleratedVideoDecoder for this). This splitting saves a lot of memory, since we allocate A+B buffers instead of 2*(A+B). (B is 5 and A is 4-VP8, 4-12 H264/VP9) Test: crosvideo changing resolutions for each codec, v_d_a_unittest on nocturne (KBL) and caroline (SKL). Bug: 912295 Reviewed-on: https://chromium-review.googlesource.com/c/1363807Reviewed-by: Hirokazu Honda <hiroh@chromium.org> Commit-Queue: Miguel Casas <mcasas@chromium.org> Cr-Original-Original-Commit-Position: refs/heads/master@{#615571} Reviewed-on: https://chromium-review.googlesource.com/c/1379274 Cr-Original-Commit-Position: refs/heads/master@{#617366} Change-Id: Ibf9a1455f8df4d52b77aee8e01f15c02878947ae Reviewed-on: https://chromium-review.googlesource.com/c/1387391 Cr-Commit-Position: refs/heads/master@{#620025}
477a706a · Miguel Casas · Commit Bot · 5e0d370c · 477a706a · 477a706a
Commit 477a706a authored Jan 04, 2019 by Miguel Casas Committed by Commit Bot Jan 04, 2019
10 changed files
--- a/media/gpu/accelerated_video_decoder.h
+++ b/media/gpu/accelerated_video_decoder.h
@@ -66,11 +66,13 @@ class MEDIA_GPU_EXPORT AcceleratedVideoDecoder {
  // we need a new set of them, or when an error occurs.
  virtual DecodeResult Decode() WARN_UNUSED_RESULT = 0;

-  // Return dimensions/required number of output surfaces that client should
-  // be ready to provide for the decoder to function properly.
-  // To be used after Decode() returns kAllocateNewSurfaces.
+  // Return dimensions/required number of pictures that client should be ready
+  // to provide for the decoder to function properly (of which up to
+  // GetNumReferenceFrames() might be needed for internal decoding). To be used
+  // after Decode() returns kAllocateNewSurfaces.
  virtual gfx::Size GetPicSize() const = 0;
  virtual size_t GetRequiredNumOfPictures() const = 0;
+  virtual size_t GetNumReferenceFrames() const = 0;

  // About 3 secs for 30 fps video. When the new sized keyframe is missed, the
  // decoder cannot decode the frame. The number of frames are skipped until

--- a/media/gpu/h264_decoder.cc
+++ b/media/gpu/h264_decoder.cc
@@ -1444,7 +1444,17 @@ gfx::Size H264Decoder::GetPicSize() const {
 }

 size_t H264Decoder::GetRequiredNumOfPictures() const {
-  return dpb_.max_num_pics() + kPicsInPipeline;
+  constexpr size_t kPicsInPipeline = limits::kMaxVideoFrames + 1;
+  return GetNumReferenceFrames() + kPicsInPipeline;
+}
+
+size_t H264Decoder::GetNumReferenceFrames() const {
+  // Use the maximum number of pictures in the Decoded Picture Buffer plus one
+  // for the one being currently egressed.
+  // Another +1 is experimentally needed for high-to-high resolution changes.
+  // TODO(mcasas): Figure out why +2 instead of +1, see crbug.com/909926 and
+  // http://crrev.com/c/1363807/9/media/gpu/h264_decoder.cc#1449.
+  return dpb_.max_num_pics() + 2;
 }

 // static

--- a/media/gpu/h264_decoder.h
+++ b/media/gpu/h264_decoder.h
@@ -168,6 +168,7 @@ class MEDIA_GPU_EXPORT H264Decoder : public AcceleratedVideoDecoder {
  DecodeResult Decode() override WARN_UNUSED_RESULT;
  gfx::Size GetPicSize() const override;
  size_t GetRequiredNumOfPictures() const override;
+  size_t GetNumReferenceFrames() const override;

  // Return true if we need to start a new picture.
  static bool IsNewPrimaryCodedPicture(const H264Picture* curr_pic,
@@ -182,17 +183,6 @@ class MEDIA_GPU_EXPORT H264Decoder : public AcceleratedVideoDecoder {
                                             H264Picture* pic);

 private:
-  // We need to keep at most kDPBMaxSize pictures in DPB for
-  // reference/to display later and an additional one for the one currently
-  // being decoded. We also ask for some additional ones since VDA needs
-  // to accumulate a few ready-to-output pictures before it actually starts
-  // displaying and giving them back. +2 instead of +1 because of subjective
-  // smoothness improvement during testing.
-  enum {
-    kPicsInPipeline = limits::kMaxVideoFrames + 2,
-    kMaxNumReqPictures = H264DPB::kDPBMaxSize + kPicsInPipeline,
-  };
-
  // Internal state of the decoder.
  enum State {
    // After initialization, need an SPS.

--- a/media/gpu/vaapi/vaapi_video_decode_accelerator.cc
+++ b/media/gpu/vaapi/vaapi_video_decode_accelerator.cc
--- a/media/gpu/vaapi/vaapi_video_decode_accelerator.h
+++ b/media/gpu/vaapi/vaapi_video_decode_accelerator.h
@@ -178,9 +178,12 @@ class MEDIA_GPU_EXPORT VaapiVideoDecodeAccelerator
  // |available_va_surfaces_|
  void RecycleVASurfaceID(VASurfaceID va_surface_id);

-  // Initiate wait cycle for surfaces to be released before we release them
-  // and allocate new ones, as requested by the decoder.
-  void InitiateSurfaceSetChange(size_t num_pics, gfx::Size size);
+  // Request a new set of |num_pics| PictureBuffers to be allocated by
+  // |client_|. Up to |num_reference_frames| out of |num_pics_| might be needed
+  // by |decoder_|.
+  void InitiateSurfaceSetChange(size_t num_pics,
+                                gfx::Size size,
+                                size_t num_reference_frames);

  // Check if the surfaces have been released or post ourselves for later.
  void TryFinishSurfaceSetChange();
@@ -252,9 +255,13 @@ class MEDIA_GPU_EXPORT VaapiVideoDecodeAccelerator
  // Only used on |task_runner_|.
  base::queue<base::OnceClosure> pending_output_cbs_;

+  // TODO(crbug.com/912295): Enable these two for IMPORT |output_mode_| as well.
  // Under some circumstances, we can pass to libva our own VASurfaceIDs to
-  // decode onto, which skips one copy. Only used on |task_runner_|.
+  // decode onto, which skips one copy. see https://crbug.com/822346.
  bool decode_using_client_picture_buffers_;
+  // When |decode_using_client_picture_buffers_| is false and under certain
+  // conditions, we can reduce the number of necessary allocated buffers.
+  bool use_reduced_number_of_allocations_;

  // WeakPtr<> pointing to |this| for use in posting tasks from the decoder
  // thread back to the ChildThread.  Because the decoder thread is a member of
@@ -289,10 +296,14 @@ class MEDIA_GPU_EXPORT VaapiVideoDecodeAccelerator
  // to be returned before we can free them. Only used on |task_runner_|.
  bool awaiting_va_surfaces_recycle_;

-  // Last requested number/resolution of output picture buffers and their
-  // format.
+  // Last requested number/resolution of output PictureBuffers.
  size_t requested_num_pics_;
  gfx::Size requested_pic_size_;
+  // Max number of reference frames needed by |decoder_|. Only used on
+  // |task_runner_| and when |use_reduced_number_of_allocations_| is true.
+  size_t requested_num_reference_frames_;
+  size_t previously_requested_num_reference_frames_;
+
  VideoCodecProfile profile_;

  // Callback to make GL context current.

--- a/media/gpu/vaapi/vaapi_video_decode_accelerator_unittest.cc
+++ b/media/gpu/vaapi/vaapi_video_decode_accelerator_unittest.cc
@@ -40,7 +40,7 @@ struct TestParams {
 constexpr int32_t kBitstreamId = 123;
 constexpr size_t kInputSize = 256;

-constexpr size_t kNumPictures = 2;
+constexpr size_t kNumPictures = 4;
 const gfx::Size kPictureSize(64, 48);

 constexpr size_t kNewNumPictures = 3;
@@ -61,6 +61,7 @@ class MockAcceleratedVideoDecoder : public AcceleratedVideoDecoder {
  MOCK_METHOD0(Decode, DecodeResult());
  MOCK_CONST_METHOD0(GetPicSize, gfx::Size());
  MOCK_CONST_METHOD0(GetRequiredNumOfPictures, size_t());
+  MOCK_CONST_METHOD0(GetNumReferenceFrames, size_t());
 };

 class MockVaapiWrapper : public VaapiWrapper {
@@ -153,7 +154,8 @@ class VaapiVideoDecodeAcceleratorTest : public TestWithParam<TestParams>,
    decoder_thread_.Start();

    // Don't want to go through a vda_->Initialize() because it binds too many
-    // items of the environment. Instead, just start the decoder thread.
+    // items of the environment. Instead, do all the necessary steps here.
+
    vda_.decoder_thread_task_runner_ = decoder_thread_.task_runner();

    // Plug in all the mocks and ourselves as the |client_|.
@@ -163,10 +165,15 @@ class VaapiVideoDecodeAcceleratorTest : public TestWithParam<TestParams>,
    vda_.vpp_vaapi_wrapper_ = mock_vpp_vaapi_wrapper_;
    vda_.vaapi_picture_factory_.reset(mock_vaapi_picture_factory_);

+    // TODO(crbug.com/917999): add IMPORT mode to test variations.
    vda_.output_mode_ = VideoDecodeAccelerator::Config::OutputMode::ALLOCATE;

    vda_.decode_using_client_picture_buffers_ =
        GetParam().decode_using_client_picture_buffers;
+    vda_.use_reduced_number_of_allocations_ =
+        !vda_.decode_using_client_picture_buffers_ &&
+        vda_.output_mode_ ==
+            VideoDecodeAccelerator::Config::OutputMode::ALLOCATE;

    vda_.state_ = VaapiVideoDecodeAccelerator::kIdle;
  }
@@ -221,6 +228,9 @@ class VaapiVideoDecodeAcceleratorTest : public TestWithParam<TestParams>,
    EXPECT_CALL(*mock_decoder_, GetRequiredNumOfPictures())
        .WillOnce(Return(num_pictures));
    EXPECT_CALL(*mock_decoder_, GetPicSize()).WillOnce(Return(picture_size));
+    const size_t kNumReferenceFrames = num_pictures / 2;
+    EXPECT_CALL(*mock_decoder_, GetNumReferenceFrames())
+        .WillOnce(Return(kNumReferenceFrames));
    EXPECT_CALL(*mock_vaapi_wrapper_, DestroyContextAndSurfaces());

    if (expect_dismiss_picture_buffers) {
@@ -228,8 +238,14 @@ class VaapiVideoDecodeAcceleratorTest : public TestWithParam<TestParams>,
          .Times(num_picture_buffers_to_dismiss);
    }

+    const size_t expected_num_picture_buffers_requested =
+        vda_.use_reduced_number_of_allocations_
+            ? num_pictures - kNumReferenceFrames
+            : num_pictures;
+
    EXPECT_CALL(*this,
-                ProvidePictureBuffers(num_pictures, _, 1, picture_size, _))
+                ProvidePictureBuffers(expected_num_picture_buffers_requested, _,
+                                      1, picture_size, _))
        .WillOnce(RunClosure(quit_closure));

    base::SharedMemoryHandle handle;
@@ -263,17 +279,18 @@ class VaapiVideoDecodeAcceleratorTest : public TestWithParam<TestParams>,
          MockCreateVaapiPicture(mock_vaapi_wrapper_.get(), picture_size))
          .Times(num_pictures);
    } else {
-      EXPECT_CALL(*mock_vaapi_wrapper_,
-                  CreateContextAndSurfaces(_, picture_size, num_pictures, _))
+      const size_t kNumReferenceFrames = num_pictures / 2;
+      EXPECT_CALL(
+          *mock_vaapi_wrapper_,
+          CreateContextAndSurfaces(_, picture_size, kNumReferenceFrames, _))
          .WillOnce(DoAll(
-              WithArg<3>(Invoke(
-                  [num_pictures](std::vector<VASurfaceID>* va_surface_ids) {
-                    va_surface_ids->resize(num_pictures);
+              WithArg<3>(Invoke([kNumReferenceFrames](
+                                    std::vector<VASurfaceID>* va_surface_ids) {
+                va_surface_ids->resize(kNumReferenceFrames);
              })),
              Return(true)));
-      EXPECT_CALL(
-          *mock_vaapi_picture_factory_,
-          MockCreateVaapiPicture(mock_vpp_vaapi_wrapper_.get(), picture_size))
+      EXPECT_CALL(*mock_vaapi_picture_factory_,
+                  MockCreateVaapiPicture(_, picture_size))
          .Times(num_pictures);
    }


--- a/media/gpu/vp8_decoder.cc
+++ b/media/gpu/vp8_decoder.cc
@@ -7,6 +7,10 @@

 namespace media {

+namespace {
+constexpr size_t kVP8NumFramesActive = 4;
+};
+
 VP8Decoder::VP8Accelerator::VP8Accelerator() {}

 VP8Decoder::VP8Accelerator::~VP8Accelerator() {}
@@ -165,9 +169,14 @@ gfx::Size VP8Decoder::GetPicSize() const {
 }

 size_t VP8Decoder::GetRequiredNumOfPictures() const {
-  const size_t kVP8NumFramesActive = 4;
-  const size_t kPicsInPipeline = limits::kMaxVideoFrames + 2;
+  constexpr size_t kPicsInPipeline = limits::kMaxVideoFrames + 1;
  return kVP8NumFramesActive + kPicsInPipeline;
 }

+size_t VP8Decoder::GetNumReferenceFrames() const {
+  // Maximum number of reference frames needed plus one for the one being
+  // currently egressed.
+  return kVP8NumFramesActive + 1;
+}
+
 }  // namespace media
--- a/media/gpu/vp8_decoder.h
+++ b/media/gpu/vp8_decoder.h
@@ -72,6 +72,7 @@ class MEDIA_GPU_EXPORT VP8Decoder : public AcceleratedVideoDecoder {
  DecodeResult Decode() override WARN_UNUSED_RESULT;
  gfx::Size GetPicSize() const override;
  size_t GetRequiredNumOfPictures() const override;
+  size_t GetNumReferenceFrames() const override;

 private:
  bool DecodeAndOutputCurrentFrame(scoped_refptr<VP8Picture> pic);

--- a/media/gpu/vp9_decoder.cc
+++ b/media/gpu/vp9_decoder.cc
@@ -261,9 +261,14 @@ gfx::Size VP9Decoder::GetPicSize() const {
 }

 size_t VP9Decoder::GetRequiredNumOfPictures() const {
-  // kMaxVideoFrames to keep higher level media pipeline populated, +2 for the
-  // pictures being parsed and decoded currently.
-  return limits::kMaxVideoFrames + kVp9NumRefFrames + 2;
+  constexpr size_t kPicsInPipeline = limits::kMaxVideoFrames + 1;
+  return kPicsInPipeline + GetNumReferenceFrames();
+}
+
+size_t VP9Decoder::GetNumReferenceFrames() const {
+  // Maximum number of reference frames needed plus one for the one being
+  // currently egressed.
+  return kVp9NumRefFrames + 1;
 }

 }  // namespace media
--- a/media/gpu/vp9_decoder.h
+++ b/media/gpu/vp9_decoder.h
@@ -106,6 +106,7 @@ class MEDIA_GPU_EXPORT VP9Decoder : public AcceleratedVideoDecoder {
  DecodeResult Decode() override WARN_UNUSED_RESULT;
  gfx::Size GetPicSize() const override;
  size_t GetRequiredNumOfPictures() const override;
+  size_t GetNumReferenceFrames() const override;

 private:
  // Update ref_frames_ based on the information in current frame header.