Allow MP4 parser to handle multiple audio and video tracks

Added a new multi-track (2 audio + 2 video tracks) .mp4 file for tests and updated MP4StreamParser to handle multiple tracks properly. BUG=249427, 249428 Review-Url: https://codereview.chromium.org/2254733006 Cr-Commit-Position: refs/heads/master@{#414872}

Allow MP4 parser to handle multiple audio and video tracks
Added a new multi-track (2 audio + 2 video tracks) .mp4 file for tests and updated MP4StreamParser to handle multiple tracks properly. BUG=249427, 249428 Review-Url: https://codereview.chromium.org/2254733006 Cr-Commit-Position: refs/heads/master@{#414872}
634e7ec2 · servolk · Commit bot · 2063c5ec · 634e7ec2 · 634e7ec2
Commit 634e7ec2 authored Aug 26, 2016 by servolk Committed by Commit bot Aug 27, 2016
6 changed files
--- a/media/formats/mp4/avc.cc
+++ b/media/formats/mp4/avc.cc
@@ -65,6 +65,9 @@ bool AVC::ConvertFrameToAnnexB(int length_size,
                               std::vector<uint8_t>* buffer,
                               std::vector<SubsampleEntry>* subsamples) {
  RCHECK(length_size == 1 || length_size == 2 || length_size == 4);
+  DVLOG(5) << __FUNCTION__ << " length_size=" << length_size
+           << " buffer->size()=" << buffer->size()
+           << " subsamples=" << (subsamples ? subsamples->size() : 0);

  if (length_size == 4)
    return ConvertAVCToAnnexBInPlaceForLengthSize4(buffer);

--- a/media/formats/mp4/mp4_stream_parser.cc
+++ b/media/formats/mp4/mp4_stream_parser.cc
@@ -41,12 +41,8 @@ MP4StreamParser::MP4StreamParser(const std::set<int>& audio_object_types,
      highest_end_offset_(0),
      has_audio_(false),
      has_video_(false),
-      audio_track_id_(0),
-      video_track_id_(0),
      audio_object_types_(audio_object_types),
      has_sbr_(has_sbr),
-      is_audio_track_encrypted_(false),
-      is_video_track_encrypted_(false),
      num_top_level_box_skipped_(0) {
 }

@@ -186,6 +182,9 @@ bool MP4StreamParser::ParseMoov(BoxReader* reader) {
  moov_.reset(new Movie);
  RCHECK(moov_->Parse(reader));
  runs_.reset();
+  audio_track_ids_.clear();
+  video_track_ids_.clear();
+  is_track_encrypted_.clear();

  has_audio_ = false;
  has_video_ = false;
@@ -218,8 +217,6 @@ bool MP4StreamParser::ParseMoov(BoxReader* reader) {

    if (track->media.handler.type == kAudio) {
      detected_audio_track_count++;
-      if (audio_config.IsValidConfig())
-        continue;  // Skip other audio tracks once we found a supported one.

      RCHECK(!samp_descr.audio_entries.empty());

@@ -308,20 +305,30 @@ bool MP4StreamParser::ParseMoov(BoxReader* reader) {
        return false;
      }

-      is_audio_track_encrypted_ = entry.sinf.info.track_encryption.is_encrypted;
-      DVLOG(1) << "is_audio_track_encrypted_: " << is_audio_track_encrypted_;
+      uint32_t audio_track_id = track->header.track_id;
+      if (audio_track_ids_.find(audio_track_id) != audio_track_ids_.end()) {
+        MEDIA_LOG(ERROR, media_log_)
+            << "Audio track with track_id=" << audio_track_id
+            << " already present.";
+        return false;
+      }
+      bool is_track_encrypted = entry.sinf.info.track_encryption.is_encrypted;
+      is_track_encrypted_[audio_track_id] = is_track_encrypted;
      audio_config.Initialize(
          codec, sample_format, channel_layout, sample_per_second, extra_data,
-          is_audio_track_encrypted_ ? AesCtrEncryptionScheme() : Unencrypted(),
+          is_track_encrypted ? AesCtrEncryptionScheme() : Unencrypted(),
          base::TimeDelta(), 0);
+      DVLOG(1) << "audio_track_id=" << audio_track_id
+               << " config=" << audio_config.AsHumanReadableString();
      if (!audio_config.IsValidConfig()) {
        MEDIA_LOG(ERROR, media_log_) << "Invalid audio decoder config: "
                                     << audio_config.AsHumanReadableString();
        return false;
      }
      has_audio_ = true;
-      audio_track_id_ = track->header.track_id;
-      media_tracks->AddAudioTrack(audio_config, audio_track_id_, "main",
+      audio_track_ids_.insert(audio_track_id);
+      const char* track_kind = (audio_track_ids_.size() == 1 ? "main" : "");
+      media_tracks->AddAudioTrack(audio_config, audio_track_id, track_kind,
                                  track->media.handler.name,
                                  track->media.header.language());
      continue;
@@ -329,8 +336,6 @@ bool MP4StreamParser::ParseMoov(BoxReader* reader) {

    if (track->media.handler.type == kVideo) {
      detected_video_track_count++;
-      if (video_config.IsValidConfig())
-        continue;  // Skip other video tracks once we found a supported one.

      RCHECK(!samp_descr.video_entries.empty());
      if (desc_idx >= samp_descr.video_entries.size())
@@ -361,23 +366,33 @@ bool MP4StreamParser::ParseMoov(BoxReader* reader) {
            gfx::Size(track->header.width, track->header.height);
      }

-      is_video_track_encrypted_ = entry.sinf.info.track_encryption.is_encrypted;
-      DVLOG(1) << "is_video_track_encrypted_: " << is_video_track_encrypted_;
+      uint32_t video_track_id = track->header.track_id;
+      if (video_track_ids_.find(video_track_id) != video_track_ids_.end()) {
+        MEDIA_LOG(ERROR, media_log_)
+            << "Video track with track_id=" << video_track_id
+            << " already present.";
+        return false;
+      }
+      bool is_track_encrypted = entry.sinf.info.track_encryption.is_encrypted;
+      is_track_encrypted_[video_track_id] = is_track_encrypted;
      video_config.Initialize(
          entry.video_codec, entry.video_codec_profile, PIXEL_FORMAT_YV12,
          COLOR_SPACE_HD_REC709, coded_size, visible_rect, natural_size,
          // No decoder-specific buffer needed for AVC;
          // SPS/PPS are embedded in the video stream
          EmptyExtraData(),
-          is_video_track_encrypted_ ? AesCtrEncryptionScheme() : Unencrypted());
+          is_track_encrypted ? AesCtrEncryptionScheme() : Unencrypted());
+      DVLOG(1) << "video_track_id=" << video_track_id
+               << " config=" << video_config.AsHumanReadableString();
      if (!video_config.IsValidConfig()) {
        MEDIA_LOG(ERROR, media_log_) << "Invalid video decoder config: "
                                     << video_config.AsHumanReadableString();
        return false;
      }
      has_video_ = true;
-      video_track_id_ = track->header.track_id;
-      media_tracks->AddVideoTrack(video_config, video_track_id_, "main",
+      video_track_ids_.insert(video_track_id);
+      const char* track_kind = (video_track_ids_.size() == 1 ? "main" : "");
+      media_tracks->AddVideoTrack(video_config, video_track_id, track_kind,
                                  track->media.handler.name,
                                  track->media.header.language());
      continue;
@@ -514,8 +529,10 @@ bool MP4StreamParser::EnqueueSample(BufferQueueMap* buffers, bool* err) {
  queue_.Peek(&buf, &buf_size);
  if (!buf_size) return false;

-  bool audio = has_audio_ && audio_track_id_ == runs_->track_id();
-  bool video = has_video_ && video_track_id_ == runs_->track_id();
+  bool audio =
+      audio_track_ids_.find(runs_->track_id()) != audio_track_ids_.end();
+  bool video =
+      video_track_ids_.find(runs_->track_id()) != video_track_ids_.end();

  // Skip this entire track if it's not one we're interested in
  if (!audio && !video) {
@@ -585,8 +602,7 @@ bool MP4StreamParser::EnqueueSample(BufferQueueMap* buffers, bool* err) {
        subsamples));
    }
    // else, use the existing config.
-  } else if ((audio && is_audio_track_encrypted_) ||
-             (video && is_video_track_encrypted_)) {
+  } else if (is_track_encrypted_[runs_->track_id()]) {
    // The media pipeline requires a DecryptConfig with an empty |iv|.
    // TODO(ddorwin): Refactor so we do not need a fake key ID ("1");
    decrypt_config.reset(
@@ -596,9 +612,6 @@ bool MP4StreamParser::EnqueueSample(BufferQueueMap* buffers, bool* err) {
  StreamParserBuffer::Type buffer_type = audio ? DemuxerStream::AUDIO :
      DemuxerStream::VIDEO;

-  // TODO(wolenetz/acolwell): Validate and use a common cross-parser TrackId
-  // type and allow multiple tracks for same media type, if applicable. See
-  // https://crbug.com/341581.
  scoped_refptr<StreamParserBuffer> stream_buf = StreamParserBuffer::CopyFrom(
      &frame_buf[0], frame_buf.size(), runs_->is_keyframe(), buffer_type,
      runs_->track_id());
@@ -610,7 +623,8 @@ bool MP4StreamParser::EnqueueSample(BufferQueueMap* buffers, bool* err) {
  stream_buf->set_timestamp(runs_->cts());
  stream_buf->SetDecodeTimestamp(runs_->dts());

-  DVLOG(3) << "Pushing frame: aud=" << audio
+  DVLOG(3) << "Emit " << (audio ? "audio" : "video") << " frame: "
+           << " track_id=" << runs_->track_id()
           << ", key=" << runs_->is_keyframe()
           << ", dur=" << runs_->duration().InMilliseconds()
           << ", dts=" << runs_->dts().InMilliseconds()

--- a/media/formats/mp4/mp4_stream_parser.h
+++ b/media/formats/mp4/mp4_stream_parser.h
@@ -118,13 +118,12 @@ class MEDIA_EXPORT MP4StreamParser : public StreamParser {

  bool has_audio_;
  bool has_video_;
-  uint32_t audio_track_id_;
-  uint32_t video_track_id_;
+  std::set<uint32_t> audio_track_ids_;
+  std::set<uint32_t> video_track_ids_;
  // The object types allowed for audio tracks.
  std::set<int> audio_object_types_;
  bool has_sbr_;
-  bool is_audio_track_encrypted_;
-  bool is_video_track_encrypted_;
+  std::map<uint32_t, bool> is_track_encrypted_;

  // Tracks the number of MEDIA_LOGs for skipping top level boxes. Useful to
  // prevent log spam.

--- a/media/formats/mp4/mp4_stream_parser_unittest.cc
+++ b/media/formats/mp4/mp4_stream_parser_unittest.cc
@@ -125,14 +125,14 @@ class MP4StreamParserTest : public testing::Test {
      if (track->type() == MediaTrack::Audio) {
        audio_track_id_ = track_id;
        audio_decoder_config_ = tracks->getAudioConfig(track_id);
-        DVLOG(1) << "Audio track " << track_id << " config="
+        DVLOG(1) << "track_id=" << track_id << " audio config="
                 << (audio_decoder_config_.IsValidConfig()
                         ? audio_decoder_config_.AsHumanReadableString()
                         : "INVALID");
      } else if (track->type() == MediaTrack::Video) {
        video_track_id_ = track_id;
        video_decoder_config_ = tracks->getVideoConfig(track_id);
-        DVLOG(1) << "Video track " << track_id << " config="
+        DVLOG(1) << "track_id=" << track_id << " video config="
                 << (video_decoder_config_.IsValidConfig()
                         ? video_decoder_config_.AsHumanReadableString()
                         : "INVALID");
@@ -143,48 +143,33 @@ class MP4StreamParserTest : public testing::Test {
  }

  bool NewBuffersF(const StreamParser::BufferQueueMap& buffer_queue_map) {
-    // Ensure that track ids are properly assigned on all emitted buffers.
+    DecodeTimestamp lowest_end_dts = kNoDecodeTimestamp();
    for (const auto& it : buffer_queue_map) {
      DVLOG(3) << "Buffers for track_id=" << it.first;
+      DCHECK(!it.second.empty());
+
+      if (lowest_end_dts == kNoDecodeTimestamp() ||
+          lowest_end_dts > it.second.back()->GetDecodeTimestamp())
+        lowest_end_dts = it.second.back()->GetDecodeTimestamp();
+
      for (const auto& buf : it.second) {
        DVLOG(3) << "  track_id=" << buf->track_id()
                 << ", size=" << buf->data_size()
                 << ", pts=" << buf->timestamp().InSecondsF()
                 << ", dts=" << buf->GetDecodeTimestamp().InSecondsF()
                 << ", dur=" << buf->duration().InSecondsF();
+        // Ensure that track ids are properly assigned on all emitted buffers.
        EXPECT_EQ(it.first, buf->track_id());
      }
    }

-    const StreamParser::BufferQueue empty_buffers;
-    const auto& itr_audio = buffer_queue_map.find(audio_track_id_);
-    const StreamParser::BufferQueue& audio_buffers =
-        (itr_audio == buffer_queue_map.end()) ? empty_buffers
-                                              : itr_audio->second;
-
-    const auto& itr_video = buffer_queue_map.find(video_track_id_);
-    const StreamParser::BufferQueue& video_buffers =
-        (itr_video == buffer_queue_map.end()) ? empty_buffers
-                                              : itr_video->second;
-
-    // Find the second highest timestamp so that we know what the
-    // timestamps on the next set of buffers must be >= than.
-    DecodeTimestamp audio = !audio_buffers.empty() ?
-        audio_buffers.back()->GetDecodeTimestamp() : kNoDecodeTimestamp();
-    DecodeTimestamp video = !video_buffers.empty() ?
-        video_buffers.back()->GetDecodeTimestamp() : kNoDecodeTimestamp();
-    DecodeTimestamp second_highest_timestamp =
-        (audio == kNoDecodeTimestamp() ||
-         (video != kNoDecodeTimestamp() && audio > video)) ? video : audio;
-
-    EXPECT_NE(second_highest_timestamp, kNoDecodeTimestamp());
-
-    if (lower_bound_ != kNoDecodeTimestamp() &&
-        second_highest_timestamp < lower_bound_) {
+    EXPECT_NE(lowest_end_dts, kNoDecodeTimestamp());
+
+    if (lower_bound_ != kNoDecodeTimestamp() && lowest_end_dts < lower_bound_) {
      return false;
    }

-    lower_bound_ = second_highest_timestamp;
+    lower_bound_ = lowest_end_dts;
    return true;
  }

@@ -544,5 +529,47 @@ TEST_F(MP4StreamParserTest, TextTrackDetection) {
  EXPECT_TRUE(AppendDataInPieces(buffer->data(), buffer->data_size(), 512));
 }

+TEST_F(MP4StreamParserTest, MultiTrackFile) {
+  auto params = GetDefaultInitParametersExpectations();
+  params.duration = base::TimeDelta::FromMilliseconds(4248);
+  params.liveness = DemuxerStream::LIVENESS_RECORDED;
+  params.detected_audio_track_count = 2;
+  params.detected_video_track_count = 2;
+  InitializeParserWithInitParametersExpectations(params);
+  EXPECT_MEDIA_LOG(VideoCodecLog("avc1.64000D")).Times(2);
+  EXPECT_MEDIA_LOG(AudioCodecLog("mp4a.40.2")).Times(2);
+  ParseMP4File("bbb-320x240-2video-2audio.mp4", 4096);
+
+  ASSERT_EQ(media_tracks_->tracks().size(), 4u);  // Fatal: indexed below.
+
+  const MediaTrack& video_track1 = *(media_tracks_->tracks()[0]);
+  EXPECT_EQ(video_track1.type(), MediaTrack::Video);
+  EXPECT_EQ(video_track1.bytestream_track_id(), 1);
+  EXPECT_EQ(video_track1.kind(), "main");
+  EXPECT_EQ(video_track1.label(), "VideoHandler");
+  EXPECT_EQ(video_track1.language(), "und");
+
+  const MediaTrack& audio_track1 = *(media_tracks_->tracks()[1]);
+  EXPECT_EQ(audio_track1.type(), MediaTrack::Audio);
+  EXPECT_EQ(audio_track1.bytestream_track_id(), 2);
+  EXPECT_EQ(audio_track1.kind(), "main");
+  EXPECT_EQ(audio_track1.label(), "SoundHandler");
+  EXPECT_EQ(audio_track1.language(), "und");
+
+  const MediaTrack& video_track2 = *(media_tracks_->tracks()[2]);
+  EXPECT_EQ(video_track2.type(), MediaTrack::Video);
+  EXPECT_EQ(video_track2.bytestream_track_id(), 3);
+  EXPECT_EQ(video_track2.kind(), "");
+  EXPECT_EQ(video_track2.label(), "VideoHandler");
+  EXPECT_EQ(video_track2.language(), "und");
+
+  const MediaTrack& audio_track2 = *(media_tracks_->tracks()[3]);
+  EXPECT_EQ(audio_track2.type(), MediaTrack::Audio);
+  EXPECT_EQ(audio_track2.bytestream_track_id(), 4);
+  EXPECT_EQ(audio_track2.kind(), "");
+  EXPECT_EQ(audio_track2.label(), "SoundHandler");
+  EXPECT_EQ(audio_track2.language(), "und");
+}
+
 }  // namespace mp4
 }  // namespace media
--- a/media/test/data/README
+++ b/media/test/data/README
@@ -235,3 +235,16 @@ media/test/data/bear-1280x720-aac_he.ts
 media/test/data/bear-320x240-v_frag-hevc.mp4
  HEVC video stream in fragmented MP4 container, generated with
  ffmpeg -i bear-320x240.webm -c:v libx265 -an -movflags faststart+frag_keyframe bear-320x240-v_frag-hevc.mp4
+
+// Multi-track MP4 file
+// (c) copyright 2008, Blender Foundation / www.bigbuckbunny.org
+media/test/data/bbb-320x240-2video-2audio.mp4
+  Generated using the following commands:
+  // Download the source file with 1 video and 1 audio stream.
+  wget http://distribution.bbb3d.renderfarming.net/video/mp4/bbb_sunflower_1080p_30fps_normal.mp4
+  // Generate a video scaled down to 320x240 with 2-channel AAC-LC audio from the source file.
+  ffmpeg -i bbb_sunflower_1080p_30fps_normal.mp4 -c:v libx264 -crf 36 -vf  scale=320:240 -c:a libfdk_aac -ac 2 -t 24 bbb1.mp4
+  // Generate a file with the original video scaled down to 320x240 and flipped upside down, with a sine wave in place of the original audio.
+  ffmpeg -i bbb_sunflower_1080p_30fps_normal.mp4 -f lavfi -i "sine=frequency=500:sample_rate=48000" -map 0:v -map 1:a -c:v libx264 -crf 36 -vf scale=320:240,vflip -c:a libfdk_aac -ac 2 -t 24 bbb2.mp4
+  // Combine the two files generated above into a single fragmented .mp4 file with 2 video and 2 audio tracks.
+  ffmpeg -i bbb1.mp4 -i bbb2.mp4 -map 0:0 -map 0:1 -map 1:0 -map 1:1 -c:v copy -c:a copy -movflags frag_keyframe+omit_tfhd_offset+separate_moof bbb-320x240-2video-2audio.mp4
--- a/media/test/data/bbb-320x240-2video-2audio.mp4
+++ b/media/test/data/bbb-320x240-2video-2audio.mp4