Commit c81c0cc3 authored by Wei Guan, committed by Commit Bot

Add audio perception and human presence detection to MediaPerceptionPrivate API.

BUG=803706

Change-Id: I5c1d7476d27e9ce35e9bd758351e6ebc22e43771
Reviewed-on: https://chromium-review.googlesource.com/884008
Reviewed-by: Steven Bennetts <stevenjb@chromium.org>
Reviewed-by: Toni Barzic <tbarzic@chromium.org>
Commit-Queue: Wei Guan <weigua@chromium.org>
Cr-Commit-Position: refs/heads/master@{#533181}
parent 824466d2
......@@ -20,6 +20,10 @@ message MediaPerception {
// A single FramePerception message or array of perceptions (if reporting the
// results from multiple frames).
repeated FramePerception frame_perception = 2;
// A single AudioPerception message or array of audio perceptions (if
// reporting the results from multiple audio frames).
repeated AudioPerception audio_perception = 3;
}
// Used to transmit a history of image frames and their associated annotations.
......@@ -72,8 +76,55 @@ message State {
// This is the output of the MediaPerceptionSinkCalculator.
// Bundles the perception results for a single media sample; each field is
// populated only when the corresponding analysis ran for this sample.
message PerceptionSample {
// Computer-vision annotations for the video frame of this sample.
optional FramePerception frame_perception = 1;
// The image frame data associated with the frame perception.
optional ImageFrame image_frame = 2;
// Audio analysis results associated with this sample.
optional AudioPerception audio_perception = 3;
}
// Audio perception results for an audio frame.
message AudioPerception {
// A timestamp in microseconds attached when this message was generated.
// NOTE(review): the clock source (boot time vs. wall clock) is not specified
// here -- confirm with the producer before comparing across streams.
optional uint64 timestamp_us = 1;
// Audio localization results for an audio frame.
optional AudioLocalization audio_localization = 2;
// Human presence detection results for an audio frame.
optional AudioHumanPresenceDetection audio_human_presence_detection = 3;
}
// An estimate of the direction that the sound is coming from.
message AudioLocalization {
// An angle in radians in the horizontal plane. It roughly points to the peak
// in the probability distribution of azimuth defined below.
optional double azimuth_radians = 1;
// A probability distribution for the current snapshot in time that shows the
// likelihood of a sound source being at a particular azimuth. For example,
// azimuthScores = [0.1, 0.2, 0.3, 0.4] means that the probability that the
// sound is coming from an azimuth of 0, pi/2, pi, 3*pi/2 is 0.1, 0.2, 0.3 and
// 0.4, respectively.
// Per the example above, bins appear to be evenly spaced over [0, 2*pi).
repeated double azimuth_scores = 2;
}
// Detection of human presence close to the microphone.
message AudioHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human has caused a sound
// close to the microphone.
optional double human_presence_likelihood = 1;
// Estimate of the noise spectrogram.
optional AudioSpectrogram noise_spectrogram = 2;
// Spectrogram of an audio frame.
// NOTE(review): presumably the frame on which human_presence_likelihood was
// computed -- confirm with the audio pipeline.
optional AudioSpectrogram frame_spectrogram = 3;
}
// Spectrogram of an audio frame.
message AudioSpectrogram {
// Spectrogram values. The frequency-bin layout and units are defined by the
// producing audio pipeline and are not specified in this proto.
repeated double values = 1;
}
// This message stores the image frame along with the meta data.
......@@ -106,6 +157,50 @@ message FramePerception {
// Latency measurement for a list of packet streams in drishti graph.
repeated PacketLatency packet_latency = 6;
// Human presence detection results for a video frame.
optional VideoHumanPresenceDetection video_human_presence_detection = 7;
}
// Detection of human presence close to the camera.
message VideoHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human is present in the
// video frame.
optional double human_presence_likelihood = 1;
// Indicates a probability in [0, 1] interval that motion has been detected
// in the video frame.
optional double motion_detected_likelihood = 2;
// Type of lighting conditions.
// Values are unprefixed because the enum is nested in this message, which
// already scopes them in generated code.
enum LightCondition {
// Lighting condition was not estimated for this frame.
UNSPECIFIED = 0;
// No noticeable change occurred.
NO_CHANGE = 1;
// Light was switched on in the room.
TURNED_ON = 2;
// Light was switched off in the room.
TURNED_OFF = 3;
// Light gradually got dimmer (for example, due to a sunset).
DIMMER = 4;
// Light gradually got brighter (for example, due to a sunrise).
BRIGHTER = 5;
// Black frame detected - the current frame contains only noise.
BLACK_FRAME = 6;
}
// Indicates lighting condition in the video frame.
optional LightCondition light_condition = 3;
// Indicates a probability in [0, 1] interval that light condition value is
// correct.
optional double light_condition_likelihood = 4;
}
message Entity {
......
......@@ -12,6 +12,127 @@ namespace media_perception_private {
namespace {
// Converts an mri::AudioSpectrogram proto into its IDL counterpart.
// The |values| vector is only allocated when the proto carries at least one
// entry; otherwise it stays null, matching the optional IDL field semantics.
std::unique_ptr<AudioSpectrogram> AudioSpectrogramProtoToIdl(
    const mri::AudioSpectrogram& spectrogram) {
  auto idl_spectrogram = std::make_unique<AudioSpectrogram>();
  if (spectrogram.values_size() > 0) {
    // Range-construct the vector directly from the repeated field.
    idl_spectrogram->values = std::make_unique<std::vector<double>>(
        spectrogram.values().begin(), spectrogram.values().end());
  }
  return idl_spectrogram;
}
// Converts an mri::AudioHumanPresenceDetection proto into its IDL
// counterpart. Only fields explicitly present on the proto are copied; the
// rest remain null on the IDL side.
std::unique_ptr<AudioHumanPresenceDetection>
AudioHumanPresenceDetectionProtoToIdl(
    const mri::AudioHumanPresenceDetection& detection) {
  auto idl_detection = std::make_unique<AudioHumanPresenceDetection>();

  if (detection.has_human_presence_likelihood()) {
    idl_detection->human_presence_likelihood =
        std::make_unique<double>(detection.human_presence_likelihood());
  }

  // Both spectrograms share the same conversion helper.
  if (detection.has_noise_spectrogram()) {
    idl_detection->noise_spectrogram =
        AudioSpectrogramProtoToIdl(detection.noise_spectrogram());
  }
  if (detection.has_frame_spectrogram()) {
    idl_detection->frame_spectrogram =
        AudioSpectrogramProtoToIdl(detection.frame_spectrogram());
  }

  return idl_detection;
}
// Converts an mri::AudioLocalization proto into its IDL counterpart.
// Fields that are unset (or an empty score list) stay null in the result.
std::unique_ptr<AudioLocalization> AudioLocalizationProtoToIdl(
    const mri::AudioLocalization& localization) {
  auto idl_localization = std::make_unique<AudioLocalization>();

  if (localization.has_azimuth_radians()) {
    idl_localization->azimuth_radians =
        std::make_unique<double>(localization.azimuth_radians());
  }

  if (localization.azimuth_scores_size() > 0) {
    // Range-construct the score vector from the repeated field.
    idl_localization->azimuth_scores = std::make_unique<std::vector<double>>(
        localization.azimuth_scores().begin(),
        localization.azimuth_scores().end());
  }

  return idl_localization;
}
// Converts an mri::AudioPerception proto into its IDL counterpart, returned
// by value. Sub-messages are delegated to their dedicated helpers.
AudioPerception AudioPerceptionProtoToIdl(
    const mri::AudioPerception& perception) {
  AudioPerception idl_perception;

  if (perception.has_timestamp_us()) {
    // The IDL stores the timestamp as a double (JS has no 64-bit int);
    // values above 2^53 microseconds would lose precision.
    idl_perception.timestamp_us =
        std::make_unique<double>(perception.timestamp_us());
  }

  if (perception.has_audio_localization()) {
    idl_perception.audio_localization =
        AudioLocalizationProtoToIdl(perception.audio_localization());
  }

  if (perception.has_audio_human_presence_detection()) {
    idl_perception.audio_human_presence_detection =
        AudioHumanPresenceDetectionProtoToIdl(
            perception.audio_human_presence_detection());
  }

  return idl_perception;
}
// Maps the proto LightCondition enum (nested in
// mri::VideoHumanPresenceDetection) onto the generated extension-IDL enum.
// The switch covers every declared proto value; an out-of-range value
// indicates a bug in the sender, so NOTREACHED() flags it in debug builds
// while the UNSPECIFIED fallback keeps release builds safe.
LightCondition LightConditionProtoToIdl(
    const mri::VideoHumanPresenceDetection::LightCondition& condition) {
  switch (condition) {
    case mri::VideoHumanPresenceDetection::UNSPECIFIED:
      return LIGHT_CONDITION_UNSPECIFIED;
    case mri::VideoHumanPresenceDetection::NO_CHANGE:
      return LIGHT_CONDITION_NO_CHANGE;
    case mri::VideoHumanPresenceDetection::TURNED_ON:
      return LIGHT_CONDITION_TURNED_ON;
    case mri::VideoHumanPresenceDetection::TURNED_OFF:
      return LIGHT_CONDITION_TURNED_OFF;
    case mri::VideoHumanPresenceDetection::DIMMER:
      return LIGHT_CONDITION_DIMMER;
    case mri::VideoHumanPresenceDetection::BRIGHTER:
      return LIGHT_CONDITION_BRIGHTER;
    case mri::VideoHumanPresenceDetection::BLACK_FRAME:
      return LIGHT_CONDITION_BLACK_FRAME;
    default:
      NOTREACHED() << "Unknown light condition: " << condition;
      return LIGHT_CONDITION_UNSPECIFIED;
  }
}
// Converts an mri::VideoHumanPresenceDetection proto into its IDL
// counterpart. Each optional proto field is copied only when present.
std::unique_ptr<VideoHumanPresenceDetection>
VideoHumanPresenceDetectionProtoToIdl(
    const mri::VideoHumanPresenceDetection& detection) {
  auto idl_detection = std::make_unique<VideoHumanPresenceDetection>();

  if (detection.has_human_presence_likelihood()) {
    idl_detection->human_presence_likelihood =
        std::make_unique<double>(detection.human_presence_likelihood());
  }
  if (detection.has_motion_detected_likelihood()) {
    idl_detection->motion_detected_likelihood =
        std::make_unique<double>(detection.motion_detected_likelihood());
  }

  // Enum values are translated through the dedicated mapping helper.
  if (detection.has_light_condition()) {
    idl_detection->light_condition =
        LightConditionProtoToIdl(detection.light_condition());
  }
  if (detection.has_light_condition_likelihood()) {
    idl_detection->light_condition_likelihood =
        std::make_unique<double>(detection.light_condition_likelihood());
  }

  return idl_detection;
}
std::unique_ptr<Point> PointProtoToIdl(const mri::Point& point) {
std::unique_ptr<Point> point_result = std::make_unique<Point>();
if (point.has_x())
......@@ -157,6 +278,11 @@ FramePerception FramePerceptionProtoToIdl(
PacketLatencyProtoToIdl(packet_latency));
}
}
if (frame_perception.has_video_human_presence_detection()) {
frame_perception_result.video_human_presence_detection =
VideoHumanPresenceDetectionProtoToIdl(
frame_perception.video_human_presence_detection());
}
return frame_perception_result;
}
......@@ -211,6 +337,11 @@ PerceptionSample PerceptionSampleProtoToIdl(
perception_sample_result.image_frame = std::make_unique<ImageFrame>(
ImageFrameProtoToIdl(perception_sample.image_frame()));
}
if (perception_sample.has_audio_perception()) {
perception_sample_result.audio_perception =
std::make_unique<AudioPerception>(
AudioPerceptionProtoToIdl(perception_sample.audio_perception()));
}
return perception_sample_result;
}
......@@ -330,6 +461,16 @@ MediaPerception MediaPerceptionProtoToIdl(
FramePerceptionProtoToIdl(frame_perception));
}
}
if (media_perception.audio_perception_size() > 0) {
media_perception_result.audio_perceptions =
std::make_unique<std::vector<AudioPerception>>();
for (const auto& audio_perception : media_perception.audio_perception()) {
media_perception_result.audio_perceptions->emplace_back(
AudioPerceptionProtoToIdl(audio_perception));
}
}
return media_perception_result;
}
......
......@@ -40,6 +40,29 @@ void InitializeVideoStreamParam(media_perception::VideoStreamParam& param,
param.frame_rate = std::make_unique<int>(frame_rate);
}
// Fills |audio_perception| with deterministic fake data for the conversion
// tests. The literal values here must stay in sync with the expectations in
// ValidateAudioPerceptionResult().
void InitializeFakeAudioPerception(mri::AudioPerception* audio_perception) {
  audio_perception->set_timestamp_us(10086);

  // Fake localization: one azimuth estimate plus a two-bin score histogram.
  mri::AudioLocalization* localization =
      audio_perception->mutable_audio_localization();
  localization->set_azimuth_radians(1.5);
  localization->add_azimuth_scores(2.0);
  localization->add_azimuth_scores(5.0);

  // Fake human-presence detection with noise and frame spectrograms.
  mri::AudioHumanPresenceDetection* presence_detection =
      audio_perception->mutable_audio_human_presence_detection();
  presence_detection->set_human_presence_likelihood(0.4);
  mri::AudioSpectrogram* noise =
      presence_detection->mutable_noise_spectrogram();
  noise->add_values(0.1);
  noise->add_values(0.2);
  mri::AudioSpectrogram* frame =
      presence_detection->mutable_frame_spectrogram();
  frame->add_values(0.3);
}
void InitializeFakeFramePerception(const int index,
mri::FramePerception* frame_perception) {
frame_perception->set_frame_id(index);
......@@ -92,6 +115,14 @@ void InitializeFakeFramePerception(const int index,
mri::Entity* entity_three = frame_perception->add_entity();
entity_three->set_type(mri::Entity::LABELED_REGION);
entity_three->set_label(kFakeEntityLabel3);
// Add fake video human presence detection.
mri::VideoHumanPresenceDetection* detection =
frame_perception->mutable_video_human_presence_detection();
detection->set_human_presence_likelihood(0.1);
detection->set_motion_detected_likelihood(0.2);
detection->set_light_condition(mri::VideoHumanPresenceDetection::BLACK_FRAME);
detection->set_light_condition_likelihood(0.3);
}
void ValidateFramePerceptionResult(
......@@ -178,6 +209,54 @@ void ValidateFramePerceptionResult(
EXPECT_EQ(*entity_result_three.entity_label, kFakeEntityLabel3);
EXPECT_EQ(entity_result_three.type,
media_perception::ENTITY_TYPE_LABELED_REGION);
// Validate video human presence detection.
const media_perception::VideoHumanPresenceDetection* detection_result =
frame_perception_result.video_human_presence_detection.get();
ASSERT_TRUE(detection_result->human_presence_likelihood);
EXPECT_EQ(*detection_result->human_presence_likelihood, 0.1);
ASSERT_TRUE(detection_result->motion_detected_likelihood);
EXPECT_EQ(*detection_result->motion_detected_likelihood, 0.2);
EXPECT_EQ(detection_result->light_condition,
media_perception::LIGHT_CONDITION_BLACK_FRAME);
ASSERT_TRUE(detection_result->light_condition_likelihood);
EXPECT_EQ(*detection_result->light_condition_likelihood, 0.3);
}
// Checks that |audio_perception_result| matches the fake data produced by
// InitializeFakeAudioPerception(). Every pointer is ASSERTed before it is
// dereferenced so a missing field fails the test cleanly instead of
// crashing it (the original dereferenced azimuth_scores and the spectrogram
// value vectors without null checks).
void ValidateAudioPerceptionResult(
    const media_perception::AudioPerception& audio_perception_result) {
  ASSERT_TRUE(audio_perception_result.timestamp_us);
  EXPECT_EQ(*audio_perception_result.timestamp_us, 10086);

  // Validate audio localization.
  const media_perception::AudioLocalization* audio_localization =
      audio_perception_result.audio_localization.get();
  ASSERT_TRUE(audio_localization);
  ASSERT_TRUE(audio_localization->azimuth_radians);
  EXPECT_EQ(*audio_localization->azimuth_radians, 1.5);
  ASSERT_TRUE(audio_localization->azimuth_scores);
  ASSERT_EQ(2u, audio_localization->azimuth_scores->size());
  EXPECT_EQ(audio_localization->azimuth_scores->at(0), 2.0);
  EXPECT_EQ(audio_localization->azimuth_scores->at(1), 5.0);

  // Validate audio human presence detection.
  const media_perception::AudioHumanPresenceDetection* presence_detection =
      audio_perception_result.audio_human_presence_detection.get();
  ASSERT_TRUE(presence_detection);
  ASSERT_TRUE(presence_detection->human_presence_likelihood);
  EXPECT_EQ(*presence_detection->human_presence_likelihood, 0.4);

  const media_perception::AudioSpectrogram* noise_spectrogram =
      presence_detection->noise_spectrogram.get();
  ASSERT_TRUE(noise_spectrogram);
  ASSERT_TRUE(noise_spectrogram->values);
  ASSERT_EQ(2u, noise_spectrogram->values->size());
  EXPECT_EQ(noise_spectrogram->values->at(0), 0.1);
  EXPECT_EQ(noise_spectrogram->values->at(1), 0.2);

  const media_perception::AudioSpectrogram* frame_spectrogram =
      presence_detection->frame_spectrogram.get();
  ASSERT_TRUE(frame_spectrogram);
  ASSERT_TRUE(frame_spectrogram->values);
  ASSERT_EQ(1u, frame_spectrogram->values->size());
  EXPECT_EQ(frame_spectrogram->values->at(0), 0.3);
}
void InitializeFakeImageFrameData(mri::ImageFrame* image_frame) {
......@@ -214,13 +293,20 @@ TEST(MediaPerceptionConversionUtilsTest, MediaPerceptionProtoToIdl) {
mri::FramePerception* frame_perception =
media_perception.add_frame_perception();
InitializeFakeFramePerception(kFrameId, frame_perception);
mri::AudioPerception* audio_perception =
media_perception.add_audio_perception();
InitializeFakeAudioPerception(audio_perception);
media_perception::MediaPerception media_perception_result =
media_perception::MediaPerceptionProtoToIdl(media_perception);
EXPECT_EQ(*media_perception_result.timestamp, 1);
ASSERT_TRUE(media_perception_result.frame_perceptions);
ASSERT_EQ(1u, media_perception_result.frame_perceptions->size());
ASSERT_TRUE(media_perception_result.audio_perceptions);
ASSERT_EQ(1u, media_perception_result.audio_perceptions->size());
ValidateFramePerceptionResult(
kFrameId, media_perception_result.frame_perceptions->at(0));
ValidateAudioPerceptionResult(
media_perception_result.audio_perceptions->at(0));
}
TEST(MediaPerceptionConversionUtilsTest, DiagnosticsProtoToIdl) {
......
......@@ -192,6 +192,47 @@ namespace mediaPerceptionPrivate {
long? latencyUsec;
};
// Type of lighting conditions.
// Mirrors the LightCondition enum nested in the VideoHumanPresenceDetection
// proto message; values are converted one-to-one.
enum LightCondition {
// Lighting condition was not estimated.
UNSPECIFIED,
// No noticeable change occurred.
NO_CHANGE,
// Light was switched on in the room.
TURNED_ON,
// Light was switched off in the room.
TURNED_OFF,
// Light gradually got dimmer (for example, due to a sunset).
DIMMER,
// Light gradually got brighter (for example, due to a sunrise).
BRIGHTER,
// Black frame was detected - the current frame contains only noise.
BLACK_FRAME
};
// Detection of human presence close to the camera.
dictionary VideoHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human is present in
// the video frame.
double? humanPresenceLikelihood;
// Indicates a probability in [0, 1] interval that motion has been detected
// in the video frame.
double? motionDetectedLikelihood;
// Indicates lighting condition in the video frame.
LightCondition? lightCondition;
// Indicates a probability in [0, 1] interval that
// <code>lightCondition</code> value is correct.
double? lightConditionLikelihood;
};
// The set of computer vision metadata for an image frame.
dictionary FramePerception {
long? frameId;
......@@ -208,6 +249,53 @@ namespace mediaPerceptionPrivate {
// Processing latency for a list of packets.
PacketLatency[]? packetLatency;
// Human presence detection results for a video frame.
VideoHumanPresenceDetection? videoHumanPresenceDetection;
};
// An estimate of the direction that the sound is coming from.
dictionary AudioLocalization {
// An angle in radians in the horizontal plane. It roughly points to the
// peak in the probability distribution of azimuth defined below.
double? azimuthRadians;
// A probability distribution for the current snapshot in time that shows
// the likelihood of a sound source being at a particular azimuth. For
// example, <code>azimuthScores = [0.1, 0.2, 0.3, 0.4]</code> means that
// the probability that the sound is coming from an azimuth of 0, pi/2, pi,
// 3*pi/2 is 0.1, 0.2, 0.3 and 0.4, respectively. Per this example, bins
// appear to be evenly spaced over [0, 2*pi).
double[]? azimuthScores;
};
// Spectrogram of an audio frame.
dictionary AudioSpectrogram {
// Spectrogram values; units and frequency-bin layout are determined by the
// audio processing pipeline that produced them.
double[]? values;
};
// Detection of human presence close to the microphone.
dictionary AudioHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human has caused a
// sound close to the microphone.
double? humanPresenceLikelihood;
// Estimate of the noise spectrogram.
AudioSpectrogram? noiseSpectrogram;
// Spectrogram of an audio frame (presumably the one the likelihood above
// was computed on).
AudioSpectrogram? frameSpectrogram;
};
// Audio perception results for an audio frame.
dictionary AudioPerception {
// A timestamp in microseconds attached when this message was generated.
// Stored as a double, so values above 2^53 microseconds lose precision.
double? timestampUs;
// Audio localization results for an audio frame.
AudioLocalization? audioLocalization;
// Audio human presence detection results for an audio frame.
AudioHumanPresenceDetection? audioHumanPresenceDetection;
};
dictionary MediaPerception {
......@@ -219,6 +307,9 @@ namespace mediaPerceptionPrivate {
// An array of framePerceptions.
FramePerception[]? framePerceptions;
// An array of audio perceptions.
AudioPerception[]? audioPerceptions;
};
enum ImageFormat {
......@@ -247,6 +338,9 @@ namespace mediaPerceptionPrivate {
// The image frame data for the associated FramePerception object.
ImageFrame? imageFrame;
// The audio perception results for an audio frame.
AudioPerception? audioPerception;
};
dictionary Diagnostics {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment