Implement actually decoding frames in VTVideoDecodeAccelerator.

This adds translation from Annex B to AVCC format, along with decoding frames and binding them to textures. It seems that kVTDecodeFrame_EnableTemporalProcessing is just a suggestion to VideoToolbox, and one that it ignores. That means that, for now, this code only outputs frames in the correct order for I-frame-only video. BUG=133828 Review URL: https://codereview.chromium.org/397883002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@287451 0039d316-1c4b-4281-b951-d872f2087c98

Implement actually decoding frames in VTVideoDecodeAccelerator.
This adds translation from Annex B to AVCC format, along with decoding frames and binding them to textures. It seems that kVTDecodeFrame_EnableTemporalProcessing is just a suggestion to VideoToolbox, and one that it ignores. That means that, for now, this code only outputs frames in the correct order for I-frame-only video. BUG=133828 Review URL: https://codereview.chromium.org/397883002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@287451 0039d316-1c4b-4281-b951-d872f2087c98
e68e3228 · sandersd@chromium.org · 65a6ec4a · e68e3228 · e68e3228 · e68e3228
Commit e68e3228 authored Aug 05, 2014 by sandersd@chromium.org
4 changed files
--- a/content/common/gpu/media/gpu_video_decode_accelerator.cc
+++ b/content/common/gpu/media/gpu_video_decode_accelerator.cc
@@ -370,11 +370,12 @@ void GpuVideoDecodeAccelerator::OnAssignPictureBuffers(
      NotifyError(media::VideoDecodeAccelerator::INVALID_ARGUMENT);
      return;
    }
-    if (texture_target_ == GL_TEXTURE_EXTERNAL_OES) {
-      // GL_TEXTURE_EXTERNAL_OES textures have their dimensions defined by the
-      // underlying EGLImage.  Use |texture_dimensions_| for this size.
+    if (texture_target_ == GL_TEXTURE_EXTERNAL_OES ||
+        texture_target_ == GL_TEXTURE_RECTANGLE) {
+      // These textures have their dimensions defined by the underlying storage.
+      // Use |texture_dimensions_| for this size.
      texture_manager->SetLevelInfo(texture_ref,
-                                    GL_TEXTURE_EXTERNAL_OES,
+                                    texture_target_,
                                    0,
                                    0,
                                    texture_dimensions_.width(),

--- a/content/common/gpu/media/vt_stubs_header.fragment
+++ b/content/common/gpu/media/vt_stubs_header.fragment
@@ -58,6 +58,12 @@ typedef OSStatus (*CMSampleBufferMakeDataReadyCallback)(
 typedef struct __CVBuffer *CVBufferRef;
 typedef CVBufferRef CVImageBufferRef;
 typedef uint32_t VTDecodeFrameFlags;
+enum {
+  kVTDecodeFrame_EnableAsynchronousDecompression = 1 << 0,
+  kVTDecodeFrame_DoNotOutputFrame = 1 << 1,
+  kVTDecodeFrame_1xRealTimePlayback = 1 << 2,
+  kVTDecodeFrame_EnableTemporalProcessing = 1 << 3,
+};
 typedef UInt32 VTDecodeInfoFlags;
 typedef struct OpaqueVTDecompressionSession* VTDecompressionSessionRef;


--- a/content/common/gpu/media/vt_video_decode_accelerator.cc
+++ b/content/common/gpu/media/vt_video_decode_accelerator.cc
@@ -7,10 +7,13 @@

 #include "base/bind.h"
 #include "base/command_line.h"
+#include "base/sys_byteorder.h"
 #include "base/thread_task_runner_handle.h"
 #include "content/common/gpu/media/vt_video_decode_accelerator.h"
 #include "content/public/common/content_switches.h"
 #include "media/filters/h264_parser.h"
+#include "ui/gl/scoped_binders.h"
+#include "ui/gl/scoped_cgl.h"

 using content_common_gpu_media::kModuleVt;
 using content_common_gpu_media::InitializeStubs;
@@ -19,9 +22,14 @@ using content_common_gpu_media::StubPathMap;

 namespace content {

-// Size of length headers prepended to NALUs in MPEG-4 framing. (1, 2, or 4.)
+// Size of NALU length headers in AVCC/MPEG-4 format (can be 1, 2, or 4).
 static const int kNALUHeaderLength = 4;

+// We only request 5 picture buffers from the client; they hold the decoded
+// frames. Each buffer is reused once the client tells us that it is done
+// with it.
+static const int kNumPictureBuffers = 5;
+
 // Route decoded frame callbacks back into the VTVideoDecodeAccelerator.
 static void OutputThunk(
    void* decompression_output_refcon,
@@ -31,22 +39,31 @@ static void OutputThunk(
    CVImageBufferRef image_buffer,
    CMTime presentation_time_stamp,
    CMTime presentation_duration) {
+  // TODO(sandersd): Implement flush-before-delete to guarantee validity.
  VTVideoDecodeAccelerator* vda =
      reinterpret_cast<VTVideoDecodeAccelerator*>(decompression_output_refcon);
-  int32_t* bitstream_id_ptr = reinterpret_cast<int32_t*>(source_frame_refcon);
-  int32_t bitstream_id = *bitstream_id_ptr;
-  delete bitstream_id_ptr;
-  CFRetain(image_buffer);
-  vda->Output(bitstream_id, status, info_flags, image_buffer);
+  int32_t bitstream_id = reinterpret_cast<intptr_t>(source_frame_refcon);
+  vda->Output(bitstream_id, status, image_buffer);
+}
+
+VTVideoDecodeAccelerator::DecodedFrame::DecodedFrame(
+    int32_t bitstream_id,
+    CVImageBufferRef image_buffer)
+    : bitstream_id(bitstream_id),
+      image_buffer(image_buffer) {
+}
+
+VTVideoDecodeAccelerator::DecodedFrame::~DecodedFrame() {
 }

 VTVideoDecodeAccelerator::VTVideoDecodeAccelerator(CGLContextObj cgl_context)
    : cgl_context_(cgl_context),
      client_(NULL),
-      decoder_thread_("VTDecoderThread"),
      format_(NULL),
      session_(NULL),
-      weak_this_factory_(this) {
+      gpu_task_runner_(base::ThreadTaskRunnerHandle::Get()),
+      weak_this_factory_(this),
+      decoder_thread_("VTDecoderThread") {
  callback_.decompressionOutputCallback = OutputThunk;
  callback_.decompressionOutputRefCon = this;
 }
@@ -70,11 +87,11 @@ bool VTVideoDecodeAccelerator::Initialize(
    return false;

  if (!IsVtInitialized()) {
-    StubPathMap paths;
    // CoreVideo is also required, but the loader stops after the first
    // path is loaded. Instead we rely on the transitive dependency from
    // VideoToolbox to CoreVideo.
    // TODO(sandersd): Fallback to PrivateFrameworks for VideoToolbox.
+    StubPathMap paths;
    paths[kModuleVt].push_back(FILE_PATH_LITERAL(
        "/System/Library/Frameworks/VideoToolbox.framework/VideoToolbox"));
    if (!InitializeStubs(paths))
@@ -92,6 +109,9 @@ bool VTVideoDecodeAccelerator::Initialize(
 void VTVideoDecodeAccelerator::ConfigureDecoder(
    const std::vector<const uint8_t*>& nalu_data_ptrs,
    const std::vector<size_t>& nalu_data_sizes) {
+  DCHECK(decoder_thread_.message_loop_proxy()->BelongsToCurrentThread());
+  // Construct a new format description from the parameter sets.
+  // TODO(sandersd): Replace this with custom code to support OS X < 10.9.
  format_.reset();
  CHECK(!CMVideoFormatDescriptionCreateFromH264ParameterSets(
      kCFAllocatorDefault,
@@ -99,13 +119,11 @@ void VTVideoDecodeAccelerator::ConfigureDecoder(
      &nalu_data_ptrs.front(),    // &parameter_set_pointers
      &nalu_data_sizes.front(),   // &parameter_set_sizes
      kNALUHeaderLength,          // nal_unit_header_length
-      format_.InitializeInto()
-  ));
-
-  // TODO(sandersd): Check if the size has changed and handle picture requests.
-  CMVideoDimensions coded_size = CMVideoFormatDescriptionGetDimensions(format_);
-  coded_size_.SetSize(coded_size.width, coded_size.height);
+      format_.InitializeInto()));
+  CMVideoDimensions coded_dimensions =
+      CMVideoFormatDescriptionGetDimensions(format_);

+  // Prepare VideoToolbox configuration dictionaries.
  base::ScopedCFTypeRef<CFMutableDictionaryRef> decoder_config(
      CFDictionaryCreateMutable(
          kCFAllocatorDefault,
@@ -126,12 +144,12 @@ void VTVideoDecodeAccelerator::ConfigureDecoder(
          &kCFTypeDictionaryKeyCallBacks,
          &kCFTypeDictionaryValueCallBacks));

-  // TODO(sandersd): ARGB for video that is not 4:2:0.
-  int32_t pixel_format = '2vuy';
 #define CFINT(i) CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &i)
+  // TODO(sandersd): RGBA option for 4:4:4 video.
+  int32_t pixel_format = kCVPixelFormatType_422YpCbCr8;
  base::ScopedCFTypeRef<CFNumberRef> cf_pixel_format(CFINT(pixel_format));
-  base::ScopedCFTypeRef<CFNumberRef> cf_width(CFINT(coded_size.width));
-  base::ScopedCFTypeRef<CFNumberRef> cf_height(CFINT(coded_size.height));
+  base::ScopedCFTypeRef<CFNumberRef> cf_width(CFINT(coded_dimensions.width));
+  base::ScopedCFTypeRef<CFNumberRef> cf_height(CFINT(coded_dimensions.height));
 #undef CFINT
  CFDictionarySetValue(
      image_config, kCVPixelBufferPixelFormatTypeKey, cf_pixel_format);
@@ -140,8 +158,8 @@ void VTVideoDecodeAccelerator::ConfigureDecoder(
  CFDictionarySetValue(
      image_config, kCVPixelBufferOpenGLCompatibilityKey, kCFBooleanTrue);

-  // TODO(sandersd): Skip if the session is compatible.
-  // TODO(sandersd): Flush frames when resetting.
+  // TODO(sandersd): Check if the session is already compatible.
+  // TODO(sandersd): Flush frames before resetting the session.
  session_.reset();
  CHECK(!VTDecompressionSessionCreate(
      kCFAllocatorDefault,
@@ -149,18 +167,29 @@ void VTVideoDecodeAccelerator::ConfigureDecoder(
      decoder_config,       // video_decoder_specification
      image_config,         // destination_image_buffer_attributes
      &callback_,           // output_callback
-      session_.InitializeInto()
-  ));
-  DVLOG(2) << "Created VTDecompressionSession";
+      session_.InitializeInto()));
+
+  // If the size has changed, trigger a request for new picture buffers.
+  gfx::Size new_coded_size(coded_dimensions.width, coded_dimensions.height);
+  if (coded_size_ != new_coded_size) {
+    coded_size_ = new_coded_size;
+    gpu_task_runner_->PostTask(FROM_HERE, base::Bind(
+        &VTVideoDecodeAccelerator::SizeChangedTask,
+        weak_this_factory_.GetWeakPtr(),
+        coded_size_));;
+  }
 }

 void VTVideoDecodeAccelerator::Decode(const media::BitstreamBuffer& bitstream) {
  DCHECK(CalledOnValidThread());
+  // TODO(sandersd): Test what happens if bitstream buffers are passed to VT out
+  // of order.
  decoder_thread_.message_loop_proxy()->PostTask(FROM_HERE, base::Bind(
      &VTVideoDecodeAccelerator::DecodeTask, base::Unretained(this),
      bitstream));
 }

+// TODO(sandersd): Proper error reporting instead of CHECKs.
 void VTVideoDecodeAccelerator::DecodeTask(
    const media::BitstreamBuffer bitstream) {
  DCHECK(decoder_thread_.message_loop_proxy()->BelongsToCurrentThread());
@@ -171,7 +200,12 @@ void VTVideoDecodeAccelerator::DecodeTask(
  CHECK(memory.Map(size));
  const uint8_t* buf = static_cast<uint8_t*>(memory.memory());

-  // Locate relevant NALUs in the buffer.
+  // NALUs are stored with Annex B format in the bitstream buffer (start codes),
+  // but VideoToolbox expects AVCC/MPEG-4 format (length headers), so we must
+  // rewrite the data.
+  //
+  // 1. Locate relevant NALUs and compute the size of the translated data.
+  //    Also record any parameter sets for VideoToolbox initialization.
  size_t data_size = 0;
  std::vector<media::H264NALU> nalus;
  std::vector<const uint8_t*> config_nalu_data_ptrs;
@@ -183,40 +217,170 @@ void VTVideoDecodeAccelerator::DecodeTask(
    if (result == media::H264Parser::kEOStream)
      break;
    CHECK_EQ(result, media::H264Parser::kOk);
+    // TODO(sandersd): Check that these are only at the start.
    if (nalu.nal_unit_type == media::H264NALU::kSPS ||
        nalu.nal_unit_type == media::H264NALU::kPPS ||
        nalu.nal_unit_type == media::H264NALU::kSPSExt) {
+      DVLOG(2) << "Parameter set " << nalu.nal_unit_type;
      config_nalu_data_ptrs.push_back(nalu.data);
      config_nalu_data_sizes.push_back(nalu.size);
-    }
+    } else {
      nalus.push_back(nalu);
-    // Each NALU will have a 4-byte length header prepended.
      data_size += kNALUHeaderLength + nalu.size;
    }
+  }

-  if (!config_nalu_data_ptrs.empty())
+  // 2. Initialize VideoToolbox.
+  // TODO(sandersd): Reinitialize when there are new parameter sets.
+  if (!session_)
    ConfigureDecoder(config_nalu_data_ptrs, config_nalu_data_sizes);

-  // TODO(sandersd): Rewrite slice NALU headers and send for decoding.
+  // 3. Allocate a memory-backed CMBlockBuffer for the translated data.
+  base::ScopedCFTypeRef<CMBlockBufferRef> data;
+  CHECK(!CMBlockBufferCreateWithMemoryBlock(
+      kCFAllocatorDefault,
+      NULL,                 // &memory_block
+      data_size,            // block_length
+      kCFAllocatorDefault,  // block_allocator
+      NULL,                 // &custom_block_source
+      0,                    // offset_to_data
+      data_size,            // data_length
+      0,                    // flags
+      data.InitializeInto()));
+
+  // 4. Copy NALU data, inserting length headers.
+  size_t offset = 0;
+  for (size_t i = 0; i < nalus.size(); i++) {
+    media::H264NALU& nalu = nalus[i];
+    uint32_t header = base::HostToNet32(static_cast<uint32_t>(nalu.size));
+    CHECK(!CMBlockBufferReplaceDataBytes(
+        &header, data, offset, kNALUHeaderLength));
+    offset += kNALUHeaderLength;
+    CHECK(!CMBlockBufferReplaceDataBytes(nalu.data, data, offset, nalu.size));
+    offset += nalu.size;
+  }
+
+  // 5. Package the data for VideoToolbox and request decoding.
+  base::ScopedCFTypeRef<CMSampleBufferRef> frame;
+  CHECK(!CMSampleBufferCreate(
+      kCFAllocatorDefault,
+      data,                 // data_buffer
+      true,                 // data_ready
+      NULL,                 // make_data_ready_callback
+      NULL,                 // make_data_ready_refcon
+      format_,              // format_description
+      1,                    // num_samples
+      0,                    // num_sample_timing_entries
+      NULL,                 // &sample_timing_array
+      0,                    // num_sample_size_entries
+      NULL,                 // &sample_size_array
+      frame.InitializeInto()));
+
+  // Asynchronous Decompression allows for parallel submission of frames
+  // (without it, DecodeFrame() does not return until the frame has been
+  // decoded). We don't enable Temporal Processing so that frames are always
+  // returned in decode order; this makes it easier to avoid deadlock.
+  VTDecodeFrameFlags decode_flags =
+      kVTDecodeFrame_EnableAsynchronousDecompression;
+
+  intptr_t bitstream_id = bitstream.id();
+  CHECK(!VTDecompressionSessionDecodeFrame(
+      session_,
+      frame,                                  // sample_buffer
+      decode_flags,                           // decode_flags
+      reinterpret_cast<void*>(bitstream_id),  // source_frame_refcon
+      NULL));                                 // &info_flags_out
 }

 // This method may be called on any VideoToolbox thread.
+// TODO(sandersd): Proper error reporting instead of CHECKs.
 void VTVideoDecodeAccelerator::Output(
    int32_t bitstream_id,
    OSStatus status,
-    VTDecodeInfoFlags info_flags,
    CVImageBufferRef image_buffer) {
-  // TODO(sandersd): Store the frame in a queue.
-  CFRelease(image_buffer);
+  CHECK(!status);
+  CHECK_EQ(CFGetTypeID(image_buffer), CVPixelBufferGetTypeID());
+  CFRetain(image_buffer);
+  gpu_task_runner_->PostTask(FROM_HERE, base::Bind(
+      &VTVideoDecodeAccelerator::OutputTask,
+      weak_this_factory_.GetWeakPtr(),
+      DecodedFrame(bitstream_id, image_buffer)));
+}
+
+void VTVideoDecodeAccelerator::OutputTask(DecodedFrame frame) {
+  DCHECK(CalledOnValidThread());
+  decoded_frames_.push(frame);
+  SendPictures();
+}
+
+void VTVideoDecodeAccelerator::SizeChangedTask(gfx::Size coded_size) {
+  DCHECK(CalledOnValidThread());
+  texture_size_ = coded_size;
+  // TODO(sandersd): Dismiss existing picture buffers.
+  client_->ProvidePictureBuffers(
+      kNumPictureBuffers, texture_size_, GL_TEXTURE_RECTANGLE_ARB);
 }

 void VTVideoDecodeAccelerator::AssignPictureBuffers(
    const std::vector<media::PictureBuffer>& pictures) {
  DCHECK(CalledOnValidThread());
+
+  for (size_t i = 0; i < pictures.size(); i++) {
+    CHECK(!texture_ids_.count(pictures[i].id()));
+    available_picture_ids_.push(pictures[i].id());
+    texture_ids_[pictures[i].id()] = pictures[i].texture_id();
+  }
+
+  // Pictures are not marked as uncleared until this method returns. They will
+  // become broken if they are used before that happens.
+  gpu_task_runner_->PostTask(FROM_HERE, base::Bind(
+      &VTVideoDecodeAccelerator::SendPictures,
+      weak_this_factory_.GetWeakPtr()));
 }

 void VTVideoDecodeAccelerator::ReusePictureBuffer(int32_t picture_id) {
  DCHECK(CalledOnValidThread());
+  DCHECK_EQ(CFGetRetainCount(picture_bindings_[picture_id]), 1);
+  picture_bindings_.erase(picture_id);
+  available_picture_ids_.push(picture_id);
+  SendPictures();
+}
+
+// TODO(sandersd): Proper error reporting instead of CHECKs.
+void VTVideoDecodeAccelerator::SendPictures() {
+  DCHECK(CalledOnValidThread());
+  if (available_picture_ids_.empty() || decoded_frames_.empty())
+    return;
+
+  gfx::ScopedCGLSetCurrentContext scoped_set_current_context(cgl_context_);
+  glEnable(GL_TEXTURE_RECTANGLE_ARB);
+
+  while (!available_picture_ids_.empty() && !decoded_frames_.empty()) {
+    int32_t picture_id = available_picture_ids_.front();
+    available_picture_ids_.pop();
+    DecodedFrame frame = decoded_frames_.front();
+    decoded_frames_.pop();
+    IOSurfaceRef surface = CVPixelBufferGetIOSurface(frame.image_buffer);
+
+    gfx::ScopedTextureBinder
+        texture_binder(GL_TEXTURE_RECTANGLE_ARB, texture_ids_[picture_id]);
+    CHECK(!CGLTexImageIOSurface2D(
+        cgl_context_,                 // ctx
+        GL_TEXTURE_RECTANGLE_ARB,     // target
+        GL_RGB,                       // internal_format
+        texture_size_.width(),        // width
+        texture_size_.height(),       // height
+        GL_YCBCR_422_APPLE,           // format
+        GL_UNSIGNED_SHORT_8_8_APPLE,  // type
+        surface,                      // io_surface
+        0));                          // plane
+
+    picture_bindings_[picture_id] = frame.image_buffer;
+    client_->PictureReady(media::Picture(picture_id, frame.bitstream_id));
+    client_->NotifyEndOfBitstreamBuffer(frame.bitstream_id);
+  }
+
+  glDisable(GL_TEXTURE_RECTANGLE_ARB);
 }

 void VTVideoDecodeAccelerator::Flush() {

--- a/content/common/gpu/media/vt_video_decode_accelerator.h
+++ b/content/common/gpu/media/vt_video_decode_accelerator.h
@@ -5,7 +5,11 @@
 #ifndef CONTENT_COMMON_GPU_MEDIA_VT_VIDEO_DECODE_ACCELERATOR_H_
 #define CONTENT_COMMON_GPU_MEDIA_VT_VIDEO_DECODE_ACCELERATOR_H_

-#include "base/basictypes.h"
+#include <stdint.h>
+
+#include <map>
+#include <queue>
+
 #include "base/mac/scoped_cftyperef.h"
 #include "base/memory/ref_counted.h"
 #include "base/memory/weak_ptr.h"
@@ -45,38 +49,74 @@ class VTVideoDecodeAccelerator
  virtual void Destroy() OVERRIDE;
  virtual bool CanDecodeOnIOThread() OVERRIDE;

-  // Called by VideoToolbox when a frame is decoded.
+  // Called by OutputThunk() when VideoToolbox finishes decoding a frame.
  void Output(
      int32_t bitstream_id,
      OSStatus status,
-      VTDecodeInfoFlags info_flags,
      CVImageBufferRef image_buffer);

 private:
-  // Configure a VideoToolbox decompression session from parameter set NALUs.
+  struct DecodedFrame {
+    DecodedFrame(int32_t bitstream_id, CVImageBufferRef image_buffer);
+    ~DecodedFrame();
+
+    int32_t bitstream_id;
+    base::ScopedCFTypeRef<CVImageBufferRef> image_buffer;
+  };
+
+  // Methods for interacting with VideoToolbox. Run on |decoder_thread_|.
  void ConfigureDecoder(
      const std::vector<const uint8_t*>& nalu_data_ptrs,
      const std::vector<size_t>& nalu_data_sizes);
-
-  // Decode a frame of bitstream.
  void DecodeTask(const media::BitstreamBuffer);

+  // Methods for interacting with |client_|. Run on |gpu_task_runner_|.
+  void OutputTask(DecodedFrame frame);
+  void SizeChangedTask(gfx::Size coded_size);
+  void SendPictures();
+
+  //
+  // GPU thread state.
+  //
  CGLContextObj cgl_context_;
  media::VideoDecodeAccelerator::Client* client_;
-  base::Thread decoder_thread_;
+  gfx::Size texture_size_;

-  // Decoder configuration (used only on decoder thread).
+  // Texture IDs of pictures.
+  // TODO(sandersd): A single map of structs holding picture data.
+  std::map<int32_t, uint32_t> texture_ids_;
+
+  // Pictures ready to be rendered to.
+  std::queue<int32_t> available_picture_ids_;
+
+  // Decoded frames ready to render.
+  std::queue<DecodedFrame> decoded_frames_;
+
+  // Image buffers kept alive while they are bound to pictures.
+  std::map<int32_t, base::ScopedCFTypeRef<CVImageBufferRef>> picture_bindings_;
+
+  //
+  // Decoder thread state.
+  //
  VTDecompressionOutputCallbackRecord callback_;
  base::ScopedCFTypeRef<CMFormatDescriptionRef> format_;
  base::ScopedCFTypeRef<VTDecompressionSessionRef> session_;
  media::H264Parser parser_;
  gfx::Size coded_size_;

-  // Member variables should appear before the WeakPtrFactory, to ensure
-  // that any WeakPtrs to Controller are invalidated before its members
-  // variable's destructors are executed, rendering them invalid.
+  //
+  // Unprotected shared state (set up and torn down on GPU thread).
+  //
+  scoped_refptr<base::SingleThreadTaskRunner> gpu_task_runner_;
+
+  // This WeakPtrFactory does not need to be last as its pointers are bound to
+  // the same thread it is destructed on (the GPU thread).
  base::WeakPtrFactory<VTVideoDecodeAccelerator> weak_this_factory_;

+  // Declared last to ensure that all decoder thread tasks complete before any
+  // state is destructed.
+  base::Thread decoder_thread_;
+
  DISALLOW_COPY_AND_ASSIGN(VTVideoDecodeAccelerator);
 };