Commit c81c0cc3 authored by Wei Guan, committed by Commit Bot

Add audio perception and human presence detection to MediaPerceptionPrivate API.

BUG=803706

Change-Id: I5c1d7476d27e9ce35e9bd758351e6ebc22e43771
Reviewed-on: https://chromium-review.googlesource.com/884008
Reviewed-by: Steven Bennetts <stevenjb@chromium.org>
Reviewed-by: Toni Barzic <tbarzic@chromium.org>
Commit-Queue: Wei Guan <weigua@chromium.org>
Cr-Commit-Position: refs/heads/master@{#533181}
parent 824466d2
......@@ -20,6 +20,10 @@ message MediaPerception {
// A single FramePerception message or array of perceptions (if reporting the
// results from multiple frames).
repeated FramePerception frame_perception = 2;
// A single AudioPerception message or array of audio perceptions (if
// reporting the results from multiple audio frames).
repeated AudioPerception audio_perception = 3;
}
// Used to transmit a history of image frames and their associated annotations.
......@@ -72,8 +76,55 @@ message State {
// This is the output of the MediaPerceptionSinkCalculator.
// Bundles the perception results for a single media sample; each field is
// populated only when the corresponding analysis ran for this sample.
message PerceptionSample {
// Computer-vision annotations for the video frame of this sample.
optional FramePerception frame_perception = 1;
// The image frame data associated with the frame perception.
optional ImageFrame image_frame = 2;
// Audio analysis results associated with this sample.
optional AudioPerception audio_perception = 3;
}
// Audio perception results for an audio frame.
message AudioPerception {
// A timestamp in microseconds attached when this message was generated.
// NOTE(review): the clock source (boot time vs. wall clock) is not specified
// here -- confirm with the producer before comparing across streams.
optional uint64 timestamp_us = 1;
// Audio localization results for an audio frame.
optional AudioLocalization audio_localization = 2;
// Human presence detection results for an audio frame.
optional AudioHumanPresenceDetection audio_human_presence_detection = 3;
}
// An estimate of the direction that the sound is coming from.
message AudioLocalization {
// An angle in radians in the horizontal plane. It roughly points to the peak
// in the probability distribution of azimuth defined below.
optional double azimuth_radians = 1;
// A probability distribution for the current snapshot in time that shows the
// likelihood of a sound source being at a particular azimuth. For example,
// azimuthScores = [0.1, 0.2, 0.3, 0.4] means that the probability that the
// sound is coming from an azimuth of 0, pi/2, pi, 3*pi/2 is 0.1, 0.2, 0.3 and
// 0.4, respectively.
// Per the example above, bins appear to be evenly spaced over [0, 2*pi).
repeated double azimuth_scores = 2;
}
// Detection of human presence close to the microphone.
message AudioHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human has caused a sound
// close to the microphone.
optional double human_presence_likelihood = 1;
// Estimate of the noise spectrogram.
optional AudioSpectrogram noise_spectrogram = 2;
// Spectrogram of an audio frame.
// NOTE(review): presumably the frame on which human_presence_likelihood was
// computed -- confirm with the audio pipeline.
optional AudioSpectrogram frame_spectrogram = 3;
}
// Spectrogram of an audio frame.
message AudioSpectrogram {
// Spectrogram values. The frequency-bin layout and units are defined by the
// producing audio pipeline and are not specified in this proto.
repeated double values = 1;
}
// This message stores the image frame along with the meta data.
......@@ -106,6 +157,50 @@ message FramePerception {
// Latency measurement for a list of packet streams in drishti graph.
repeated PacketLatency packet_latency = 6;
// Human presence detection results for a video frame.
optional VideoHumanPresenceDetection video_human_presence_detection = 7;
}
// Detection of human presence close to the camera.
message VideoHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human is present in the
// video frame.
optional double human_presence_likelihood = 1;
// Indicates a probability in [0, 1] interval that motion has been detected
// in the video frame.
optional double motion_detected_likelihood = 2;
// Type of lighting conditions.
// Values are unprefixed because the enum is nested in this message, which
// already scopes them in generated code.
enum LightCondition {
// Lighting condition was not estimated for this frame.
UNSPECIFIED = 0;
// No noticeable change occurred.
NO_CHANGE = 1;
// Light was switched on in the room.
TURNED_ON = 2;
// Light was switched off in the room.
TURNED_OFF = 3;
// Light gradually got dimmer (for example, due to a sunset).
DIMMER = 4;
// Light gradually got brighter (for example, due to a sunrise).
BRIGHTER = 5;
// Black frame detected - the current frame contains only noise.
BLACK_FRAME = 6;
}
// Indicates lighting condition in the video frame.
optional LightCondition light_condition = 3;
// Indicates a probability in [0, 1] interval that light condition value is
// correct.
optional double light_condition_likelihood = 4;
}
message Entity {
......
......@@ -12,6 +12,127 @@ namespace media_perception_private {
namespace {
// Converts an mri::AudioSpectrogram proto into its IDL counterpart.
// The |values| vector is only allocated when the proto carries at least one
// entry; otherwise it stays null, matching the optional IDL field semantics.
std::unique_ptr<AudioSpectrogram> AudioSpectrogramProtoToIdl(
    const mri::AudioSpectrogram& spectrogram) {
  auto idl_spectrogram = std::make_unique<AudioSpectrogram>();
  if (spectrogram.values_size() > 0) {
    // Range-construct the vector directly from the repeated field.
    idl_spectrogram->values = std::make_unique<std::vector<double>>(
        spectrogram.values().begin(), spectrogram.values().end());
  }
  return idl_spectrogram;
}
// Converts an mri::AudioHumanPresenceDetection proto into its IDL
// counterpart. Only fields explicitly present on the proto are copied; the
// rest remain null on the IDL side.
std::unique_ptr<AudioHumanPresenceDetection>
AudioHumanPresenceDetectionProtoToIdl(
    const mri::AudioHumanPresenceDetection& detection) {
  auto idl_detection = std::make_unique<AudioHumanPresenceDetection>();

  if (detection.has_human_presence_likelihood()) {
    idl_detection->human_presence_likelihood =
        std::make_unique<double>(detection.human_presence_likelihood());
  }

  // Both spectrograms share the same conversion helper.
  if (detection.has_noise_spectrogram()) {
    idl_detection->noise_spectrogram =
        AudioSpectrogramProtoToIdl(detection.noise_spectrogram());
  }
  if (detection.has_frame_spectrogram()) {
    idl_detection->frame_spectrogram =
        AudioSpectrogramProtoToIdl(detection.frame_spectrogram());
  }

  return idl_detection;
}
// Converts an mri::AudioLocalization proto into its IDL counterpart.
// Fields that are unset (or an empty score list) stay null in the result.
std::unique_ptr<AudioLocalization> AudioLocalizationProtoToIdl(
    const mri::AudioLocalization& localization) {
  auto idl_localization = std::make_unique<AudioLocalization>();

  if (localization.has_azimuth_radians()) {
    idl_localization->azimuth_radians =
        std::make_unique<double>(localization.azimuth_radians());
  }

  if (localization.azimuth_scores_size() > 0) {
    // Range-construct the score vector from the repeated field.
    idl_localization->azimuth_scores = std::make_unique<std::vector<double>>(
        localization.azimuth_scores().begin(),
        localization.azimuth_scores().end());
  }

  return idl_localization;
}
// Converts an mri::AudioPerception proto into its IDL counterpart, returned
// by value. Sub-messages are delegated to their dedicated helpers.
AudioPerception AudioPerceptionProtoToIdl(
    const mri::AudioPerception& perception) {
  AudioPerception idl_perception;

  if (perception.has_timestamp_us()) {
    // The IDL stores the timestamp as a double (JS has no 64-bit int);
    // values above 2^53 microseconds would lose precision.
    idl_perception.timestamp_us =
        std::make_unique<double>(perception.timestamp_us());
  }

  if (perception.has_audio_localization()) {
    idl_perception.audio_localization =
        AudioLocalizationProtoToIdl(perception.audio_localization());
  }

  if (perception.has_audio_human_presence_detection()) {
    idl_perception.audio_human_presence_detection =
        AudioHumanPresenceDetectionProtoToIdl(
            perception.audio_human_presence_detection());
  }

  return idl_perception;
}
// Maps the proto LightCondition enum (nested in
// mri::VideoHumanPresenceDetection) onto the generated extension-IDL enum.
// The switch covers every declared proto value; an out-of-range value
// indicates a bug in the sender, so NOTREACHED() flags it in debug builds
// while the UNSPECIFIED fallback keeps release builds safe.
LightCondition LightConditionProtoToIdl(
    const mri::VideoHumanPresenceDetection::LightCondition& condition) {
  switch (condition) {
    case mri::VideoHumanPresenceDetection::UNSPECIFIED:
      return LIGHT_CONDITION_UNSPECIFIED;
    case mri::VideoHumanPresenceDetection::NO_CHANGE:
      return LIGHT_CONDITION_NO_CHANGE;
    case mri::VideoHumanPresenceDetection::TURNED_ON:
      return LIGHT_CONDITION_TURNED_ON;
    case mri::VideoHumanPresenceDetection::TURNED_OFF:
      return LIGHT_CONDITION_TURNED_OFF;
    case mri::VideoHumanPresenceDetection::DIMMER:
      return LIGHT_CONDITION_DIMMER;
    case mri::VideoHumanPresenceDetection::BRIGHTER:
      return LIGHT_CONDITION_BRIGHTER;
    case mri::VideoHumanPresenceDetection::BLACK_FRAME:
      return LIGHT_CONDITION_BLACK_FRAME;
    default:
      NOTREACHED() << "Unknown light condition: " << condition;
      return LIGHT_CONDITION_UNSPECIFIED;
  }
}
// Converts an mri::VideoHumanPresenceDetection proto into its IDL
// counterpart. Each optional proto field is copied only when present.
std::unique_ptr<VideoHumanPresenceDetection>
VideoHumanPresenceDetectionProtoToIdl(
    const mri::VideoHumanPresenceDetection& detection) {
  auto idl_detection = std::make_unique<VideoHumanPresenceDetection>();

  if (detection.has_human_presence_likelihood()) {
    idl_detection->human_presence_likelihood =
        std::make_unique<double>(detection.human_presence_likelihood());
  }
  if (detection.has_motion_detected_likelihood()) {
    idl_detection->motion_detected_likelihood =
        std::make_unique<double>(detection.motion_detected_likelihood());
  }

  // Enum values are translated through the dedicated mapping helper.
  if (detection.has_light_condition()) {
    idl_detection->light_condition =
        LightConditionProtoToIdl(detection.light_condition());
  }
  if (detection.has_light_condition_likelihood()) {
    idl_detection->light_condition_likelihood =
        std::make_unique<double>(detection.light_condition_likelihood());
  }

  return idl_detection;
}
std::unique_ptr<Point> PointProtoToIdl(const mri::Point& point) {
std::unique_ptr<Point> point_result = std::make_unique<Point>();
if (point.has_x())
......@@ -157,6 +278,11 @@ FramePerception FramePerceptionProtoToIdl(
PacketLatencyProtoToIdl(packet_latency));
}
}
if (frame_perception.has_video_human_presence_detection()) {
frame_perception_result.video_human_presence_detection =
VideoHumanPresenceDetectionProtoToIdl(
frame_perception.video_human_presence_detection());
}
return frame_perception_result;
}
......@@ -211,6 +337,11 @@ PerceptionSample PerceptionSampleProtoToIdl(
perception_sample_result.image_frame = std::make_unique<ImageFrame>(
ImageFrameProtoToIdl(perception_sample.image_frame()));
}
if (perception_sample.has_audio_perception()) {
perception_sample_result.audio_perception =
std::make_unique<AudioPerception>(
AudioPerceptionProtoToIdl(perception_sample.audio_perception()));
}
return perception_sample_result;
}
......@@ -330,6 +461,16 @@ MediaPerception MediaPerceptionProtoToIdl(
FramePerceptionProtoToIdl(frame_perception));
}
}
if (media_perception.audio_perception_size() > 0) {
media_perception_result.audio_perceptions =
std::make_unique<std::vector<AudioPerception>>();
for (const auto& audio_perception : media_perception.audio_perception()) {
media_perception_result.audio_perceptions->emplace_back(
AudioPerceptionProtoToIdl(audio_perception));
}
}
return media_perception_result;
}
......
......@@ -40,6 +40,29 @@ void InitializeVideoStreamParam(media_perception::VideoStreamParam& param,
param.frame_rate = std::make_unique<int>(frame_rate);
}
// Fills |audio_perception| with deterministic fake data for the conversion
// tests. The literal values here must stay in sync with the expectations in
// ValidateAudioPerceptionResult().
void InitializeFakeAudioPerception(mri::AudioPerception* audio_perception) {
  audio_perception->set_timestamp_us(10086);

  // Fake localization: one azimuth estimate plus a two-bin score histogram.
  mri::AudioLocalization* localization =
      audio_perception->mutable_audio_localization();
  localization->set_azimuth_radians(1.5);
  localization->add_azimuth_scores(2.0);
  localization->add_azimuth_scores(5.0);

  // Fake human-presence detection with noise and frame spectrograms.
  mri::AudioHumanPresenceDetection* presence_detection =
      audio_perception->mutable_audio_human_presence_detection();
  presence_detection->set_human_presence_likelihood(0.4);
  mri::AudioSpectrogram* noise =
      presence_detection->mutable_noise_spectrogram();
  noise->add_values(0.1);
  noise->add_values(0.2);
  mri::AudioSpectrogram* frame =
      presence_detection->mutable_frame_spectrogram();
  frame->add_values(0.3);
}
void InitializeFakeFramePerception(const int index,
mri::FramePerception* frame_perception) {
frame_perception->set_frame_id(index);
......@@ -92,6 +115,14 @@ void InitializeFakeFramePerception(const int index,
mri::Entity* entity_three = frame_perception->add_entity();
entity_three->set_type(mri::Entity::LABELED_REGION);
entity_three->set_label(kFakeEntityLabel3);
// Add fake video human presence detection.
mri::VideoHumanPresenceDetection* detection =
frame_perception->mutable_video_human_presence_detection();
detection->set_human_presence_likelihood(0.1);
detection->set_motion_detected_likelihood(0.2);
detection->set_light_condition(mri::VideoHumanPresenceDetection::BLACK_FRAME);
detection->set_light_condition_likelihood(0.3);
}
void ValidateFramePerceptionResult(
......@@ -178,6 +209,54 @@ void ValidateFramePerceptionResult(
EXPECT_EQ(*entity_result_three.entity_label, kFakeEntityLabel3);
EXPECT_EQ(entity_result_three.type,
media_perception::ENTITY_TYPE_LABELED_REGION);
// Validate video human presence detection.
const media_perception::VideoHumanPresenceDetection* detection_result =
frame_perception_result.video_human_presence_detection.get();
ASSERT_TRUE(detection_result->human_presence_likelihood);
EXPECT_EQ(*detection_result->human_presence_likelihood, 0.1);
ASSERT_TRUE(detection_result->motion_detected_likelihood);
EXPECT_EQ(*detection_result->motion_detected_likelihood, 0.2);
EXPECT_EQ(detection_result->light_condition,
media_perception::LIGHT_CONDITION_BLACK_FRAME);
ASSERT_TRUE(detection_result->light_condition_likelihood);
EXPECT_EQ(*detection_result->light_condition_likelihood, 0.3);
}
// Checks that |audio_perception_result| matches the fake data produced by
// InitializeFakeAudioPerception(). Every pointer is ASSERTed before it is
// dereferenced so a missing field fails the test cleanly instead of
// crashing it (the original dereferenced azimuth_scores and the spectrogram
// value vectors without null checks).
void ValidateAudioPerceptionResult(
    const media_perception::AudioPerception& audio_perception_result) {
  ASSERT_TRUE(audio_perception_result.timestamp_us);
  EXPECT_EQ(*audio_perception_result.timestamp_us, 10086);

  // Validate audio localization.
  const media_perception::AudioLocalization* audio_localization =
      audio_perception_result.audio_localization.get();
  ASSERT_TRUE(audio_localization);
  ASSERT_TRUE(audio_localization->azimuth_radians);
  EXPECT_EQ(*audio_localization->azimuth_radians, 1.5);
  ASSERT_TRUE(audio_localization->azimuth_scores);
  ASSERT_EQ(2u, audio_localization->azimuth_scores->size());
  EXPECT_EQ(audio_localization->azimuth_scores->at(0), 2.0);
  EXPECT_EQ(audio_localization->azimuth_scores->at(1), 5.0);

  // Validate audio human presence detection.
  const media_perception::AudioHumanPresenceDetection* presence_detection =
      audio_perception_result.audio_human_presence_detection.get();
  ASSERT_TRUE(presence_detection);
  ASSERT_TRUE(presence_detection->human_presence_likelihood);
  EXPECT_EQ(*presence_detection->human_presence_likelihood, 0.4);

  const media_perception::AudioSpectrogram* noise_spectrogram =
      presence_detection->noise_spectrogram.get();
  ASSERT_TRUE(noise_spectrogram);
  ASSERT_TRUE(noise_spectrogram->values);
  ASSERT_EQ(2u, noise_spectrogram->values->size());
  EXPECT_EQ(noise_spectrogram->values->at(0), 0.1);
  EXPECT_EQ(noise_spectrogram->values->at(1), 0.2);

  const media_perception::AudioSpectrogram* frame_spectrogram =
      presence_detection->frame_spectrogram.get();
  ASSERT_TRUE(frame_spectrogram);
  ASSERT_TRUE(frame_spectrogram->values);
  ASSERT_EQ(1u, frame_spectrogram->values->size());
  EXPECT_EQ(frame_spectrogram->values->at(0), 0.3);
}
void InitializeFakeImageFrameData(mri::ImageFrame* image_frame) {
......@@ -214,13 +293,20 @@ TEST(MediaPerceptionConversionUtilsTest, MediaPerceptionProtoToIdl) {
mri::FramePerception* frame_perception =
media_perception.add_frame_perception();
InitializeFakeFramePerception(kFrameId, frame_perception);
mri::AudioPerception* audio_perception =
media_perception.add_audio_perception();
InitializeFakeAudioPerception(audio_perception);
media_perception::MediaPerception media_perception_result =
media_perception::MediaPerceptionProtoToIdl(media_perception);
EXPECT_EQ(*media_perception_result.timestamp, 1);
ASSERT_TRUE(media_perception_result.frame_perceptions);
ASSERT_EQ(1u, media_perception_result.frame_perceptions->size());
ASSERT_TRUE(media_perception_result.audio_perceptions);
ASSERT_EQ(1u, media_perception_result.audio_perceptions->size());
ValidateFramePerceptionResult(
kFrameId, media_perception_result.frame_perceptions->at(0));
ValidateAudioPerceptionResult(
media_perception_result.audio_perceptions->at(0));
}
TEST(MediaPerceptionConversionUtilsTest, DiagnosticsProtoToIdl) {
......
......@@ -192,6 +192,47 @@ namespace mediaPerceptionPrivate {
long? latencyUsec;
};
// Type of lighting conditions.
// Mirrors the LightCondition enum nested in the VideoHumanPresenceDetection
// proto message; values are converted one-to-one.
enum LightCondition {
// Lighting condition was not estimated.
UNSPECIFIED,
// No noticeable change occurred.
NO_CHANGE,
// Light was switched on in the room.
TURNED_ON,
// Light was switched off in the room.
TURNED_OFF,
// Light gradually got dimmer (for example, due to a sunset).
DIMMER,
// Light gradually got brighter (for example, due to a sunrise).
BRIGHTER,
// Black frame was detected - the current frame contains only noise.
BLACK_FRAME
};
// Detection of human presence close to the camera.
dictionary VideoHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human is present in
// the video frame.
double? humanPresenceLikelihood;
// Indicates a probability in [0, 1] interval that motion has been detected
// in the video frame.
double? motionDetectedLikelihood;
// Indicates lighting condition in the video frame.
LightCondition? lightCondition;
// Indicates a probability in [0, 1] interval that
// <code>lightCondition</code> value is correct.
double? lightConditionLikelihood;
};
// The set of computer vision metadata for an image frame.
dictionary FramePerception {
long? frameId;
......@@ -208,6 +249,53 @@ namespace mediaPerceptionPrivate {
// Processing latency for a list of packets.
PacketLatency[]? packetLatency;
// Human presence detection results for a video frame.
VideoHumanPresenceDetection? videoHumanPresenceDetection;
};
// An estimate of the direction that the sound is coming from.
dictionary AudioLocalization {
// An angle in radians in the horizontal plane. It roughly points to the
// peak in the probability distribution of azimuth defined below.
double? azimuthRadians;
// A probability distribution for the current snapshot in time that shows
// the likelihood of a sound source being at a particular azimuth. For
// example, <code>azimuthScores = [0.1, 0.2, 0.3, 0.4]</code> means that
// the probability that the sound is coming from an azimuth of 0, pi/2, pi,
// 3*pi/2 is 0.1, 0.2, 0.3 and 0.4, respectively. Per this example, bins
// appear to be evenly spaced over [0, 2*pi).
double[]? azimuthScores;
};
// Spectrogram of an audio frame.
dictionary AudioSpectrogram {
// Spectrogram values; units and frequency-bin layout are determined by the
// audio processing pipeline that produced them.
double[]? values;
};
// Detection of human presence close to the microphone.
dictionary AudioHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human has caused a
// sound close to the microphone.
double? humanPresenceLikelihood;
// Estimate of the noise spectrogram.
AudioSpectrogram? noiseSpectrogram;
// Spectrogram of an audio frame (presumably the one the likelihood above
// was computed on).
AudioSpectrogram? frameSpectrogram;
};
// Audio perception results for an audio frame.
dictionary AudioPerception {
// A timestamp in microseconds attached when this message was generated.
// Stored as a double, so values above 2^53 microseconds lose precision.
double? timestampUs;
// Audio localization results for an audio frame.
AudioLocalization? audioLocalization;
// Audio human presence detection results for an audio frame.
AudioHumanPresenceDetection? audioHumanPresenceDetection;
};
dictionary MediaPerception {
......@@ -219,6 +307,9 @@ namespace mediaPerceptionPrivate {
// An array of framePerceptions.
FramePerception[]? framePerceptions;
// An array of audio perceptions.
AudioPerception[]? audioPerceptions;
};
enum ImageFormat {
......@@ -247,6 +338,9 @@ namespace mediaPerceptionPrivate {
// The image frame data for the associated FramePerception object.
ImageFrame? imageFrame;
// The audio perception results for an audio frame.
AudioPerception? audioPerception;
};
dictionary Diagnostics {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment