Commit e2b61933 authored by Chris Hall's avatar Chris Hall Committed by Commit Bot

Language detection: Do not cache inherited languages.

Caching inherited languages in language info was a premature optimization
which makes support for dynamic detection more complex.

This change simplifies the `Label` step. Label will no longer consider
language inheritance, inheritance is handled entirely within
AXNode::GetLanguage.

After later profiling we can consider caching the GetLanguage result
separately if that ended up being a concern.

R=dmazzoni,akihiroota

Change-Id: Idd6503fbc78b3b561cb1b418dca6378bedc7ea79
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1949807
Commit-Queue: Chris Hall <chrishall@chromium.org>
Reviewed-by: default avatarDominic Mazzoni <dmazzoni@chromium.org>
Reviewed-by: default avatarAlice Boxhall <aboxhall@chromium.org>
Cr-Commit-Position: refs/heads/master@{#722301}
parent 35bc20ce
...@@ -125,7 +125,6 @@ void AXLanguageDetectionManager::DetectLanguageForSubtreeInternal( ...@@ -125,7 +125,6 @@ void AXLanguageDetectionManager::DetectLanguageForSubtreeInternal(
// destruction of the containing node, this is due to us treating AXNode // destruction of the containing node, this is due to us treating AXNode
// as otherwise read-only and so we store any detected language // as otherwise read-only and so we store any detected language
// information on lang info. // information on lang info.
node->SetLanguageInfo(std::make_unique<AXLanguageInfo>()); node->SetLanguageInfo(std::make_unique<AXLanguageInfo>());
lang_info = node->GetLanguageInfo(); lang_info = node->GetLanguageInfo();
} else { } else {
...@@ -137,6 +136,10 @@ void AXLanguageDetectionManager::DetectLanguageForSubtreeInternal( ...@@ -137,6 +136,10 @@ void AXLanguageDetectionManager::DetectLanguageForSubtreeInternal(
// concatenation and bubbling up results. // concatenation and bubbling up results.
auto text = node->GetStringAttribute(ax::mojom::StringAttribute::kName); auto text = node->GetStringAttribute(ax::mojom::StringAttribute::kName);
// FindTopNMostFreqLangs will pad the results with
// NNetLanguageIdentifier::kUnknown in order to reach the requested number
// of languages, this means we cannot rely on the results' length and we
// have to filter the results.
const auto results = language_identifier_.FindTopNMostFreqLangs( const auto results = language_identifier_.FindTopNMostFreqLangs(
text, kMaxDetectedLanguagesPerSpan); text, kMaxDetectedLanguagesPerSpan);
...@@ -184,6 +187,12 @@ void AXLanguageDetectionManager::LabelLanguageForSubtreeInternal(AXNode* node) { ...@@ -184,6 +187,12 @@ void AXLanguageDetectionManager::LabelLanguageForSubtreeInternal(AXNode* node) {
// If the lang_info->language is already set then we have no more work to do // If the lang_info->language is already set then we have no more work to do
// for this node. // for this node.
if (lang_info && lang_info->language.empty()) { if (lang_info && lang_info->language.empty()) {
// We assign the highest probability language which is both:
// 1) reliably detected for this node, and
// 2) one of the top (kMaxDetectedLanguagesPerPage) languages on this page.
//
// This helps guard against false positives for nodes which have noisy
// language detection results in isolation.
for (const auto& lang : lang_info->detected_languages) { for (const auto& lang : lang_info->detected_languages) {
if (lang_info_stats_.CheckLanguageWithinTop(lang)) { if (lang_info_stats_.CheckLanguageWithinTop(lang)) {
lang_info->language = lang; lang_info->language = lang;
...@@ -191,43 +200,20 @@ void AXLanguageDetectionManager::LabelLanguageForSubtreeInternal(AXNode* node) { ...@@ -191,43 +200,20 @@ void AXLanguageDetectionManager::LabelLanguageForSubtreeInternal(AXNode* node) {
} }
} }
// TODO(chrishall): consider obeying the author declared lang tag in some // After attempting labelling we no longer need the detected results in
// cases, either based on proximity or based on common language detection // LanguageInfo, as they have no future use.
// error cases.
// If language is still empty then we failed to detect a language from
// this node, we will instead try construct a language from other sources
// including any lang attribute and any language from the parent tree.
if (lang_info->language.empty()) { if (lang_info->language.empty()) {
const auto& lang_attr = // If no language was assigned then LanguageInfo as a whole can safely be
node->GetStringAttribute(ax::mojom::StringAttribute::kLanguage); // destroyed.
if (!lang_attr.empty()) { node->ClearLanguageInfo();
lang_info->language = lang_attr; } else {
} else { // Otherwise, if we assigned a language then we need to keep
// We call GetLanguage() on our parent which will return a detected // LanguageInfo.language, but we can clear the detected results.
// language if it has one, otherwise it will search up the tree for a lang_info->detected_languages.clear();
// kLanguage attribute.
//
// This means that lang attributes are inherited indefinitely but
// detected language is only inherited one level.
//
// Currently we only attach detected language to text nodes, once we
// start attaching detected language on other nodes we need to rethink
// this. We may want to attach detected language information once we
// consider combining multiple smaller text nodes into one larger one.
//
// TODO(chrishall): reconsider detected language inheritance.
AXNode* parent = node->parent();
if (parent) {
const auto& parent_lang = parent->GetLanguage();
if (!parent_lang.empty()) {
lang_info->language = parent_lang;
}
}
}
} }
} }
// Recurse into children to continue labelling.
for (AXNode* child : node->children()) { for (AXNode* child : node->children()) {
LabelLanguageForSubtreeInternal(child); LabelLanguageForSubtreeInternal(child);
} }
......
...@@ -280,19 +280,22 @@ TEST(AXLanguageDetectionTest, LangAttrInheritanceFeatureFlagOn) { ...@@ -280,19 +280,22 @@ TEST(AXLanguageDetectionTest, LangAttrInheritanceFeatureFlagOn) {
{ {
AXNode* node3 = tree.GetFromId(3); AXNode* node3 = tree.GetFromId(3);
EXPECT_NE(node3->GetLanguageInfo(), nullptr); // Inherited languages are not stored in lang info.
EXPECT_EQ(node3->GetLanguageInfo(), nullptr);
EXPECT_EQ(node3->GetLanguage(), "en"); EXPECT_EQ(node3->GetLanguage(), "en");
} }
{ {
AXNode* node4 = tree.GetFromId(4); AXNode* node4 = tree.GetFromId(4);
EXPECT_NE(node4->GetLanguageInfo(), nullptr); // Inherited languages are not stored in lang info.
EXPECT_EQ(node4->GetLanguageInfo(), nullptr);
EXPECT_EQ(node4->GetLanguage(), "fr"); EXPECT_EQ(node4->GetLanguage(), "fr");
} }
{ {
AXNode* node5 = tree.GetFromId(5); AXNode* node5 = tree.GetFromId(5);
EXPECT_NE(node5->GetLanguageInfo(), nullptr); // Inherited languages are not stored in lang info.
EXPECT_EQ(node5->GetLanguageInfo(), nullptr);
EXPECT_EQ(node5->GetLanguage(), "fr"); EXPECT_EQ(node5->GetLanguage(), "fr");
} }
} }
...@@ -406,7 +409,8 @@ TEST(AXLanguageDetectionTest, LanguageDetectionBasic) { ...@@ -406,7 +409,8 @@ TEST(AXLanguageDetectionTest, LanguageDetectionBasic) {
{ {
AXNode* node5 = tree.GetFromId(5); AXNode* node5 = tree.GetFromId(5);
EXPECT_TRUE(node5->IsText()); EXPECT_TRUE(node5->IsText());
EXPECT_NE(node5->GetLanguageInfo(), nullptr); // Inherited languages are not stored in lang info.
EXPECT_EQ(node5->GetLanguageInfo(), nullptr);
EXPECT_EQ(node5->GetLanguage(), "en"); EXPECT_EQ(node5->GetLanguage(), "en");
} }
} }
...@@ -528,8 +532,9 @@ TEST(AXLanguageDetectionTest, LanguageDetectionDetectOnly) { ...@@ -528,8 +532,9 @@ TEST(AXLanguageDetectionTest, LanguageDetectionDetectOnly) {
{ {
AXNode* node5 = tree.GetFromId(5); AXNode* node5 = tree.GetFromId(5);
EXPECT_TRUE(node5->IsText()); EXPECT_TRUE(node5->IsText());
// Inherited languages are not stored in lang info, but removal of empty
// LanguageInfo(s) occurs during labelling. So we expect this to be empty.
ASSERT_NE(node5->GetLanguageInfo(), nullptr); ASSERT_NE(node5->GetLanguageInfo(), nullptr);
ASSERT_EQ(node5->GetLanguageInfo()->detected_languages.size(), (unsigned)0);
EXPECT_TRUE(node5->GetLanguageInfo()->language.empty()); EXPECT_TRUE(node5->GetLanguageInfo()->language.empty());
EXPECT_EQ(node5->GetLanguage(), "fr"); EXPECT_EQ(node5->GetLanguage(), "fr");
} }
......
...@@ -314,7 +314,7 @@ base::string16 AXNode::GetInheritedString16Attribute( ...@@ -314,7 +314,7 @@ base::string16 AXNode::GetInheritedString16Attribute(
return base::UTF8ToUTF16(GetInheritedStringAttribute(attribute)); return base::UTF8ToUTF16(GetInheritedStringAttribute(attribute));
} }
AXLanguageInfo* AXNode::GetLanguageInfo() { AXLanguageInfo* AXNode::GetLanguageInfo() const {
return language_info_.get(); return language_info_.get();
} }
...@@ -322,16 +322,27 @@ void AXNode::SetLanguageInfo(std::unique_ptr<AXLanguageInfo> lang_info) { ...@@ -322,16 +322,27 @@ void AXNode::SetLanguageInfo(std::unique_ptr<AXLanguageInfo> lang_info) {
language_info_ = std::move(lang_info); language_info_ = std::move(lang_info);
} }
void AXNode::ClearLanguageInfo() {
language_info_.reset();
}
std::string AXNode::GetLanguage() { std::string AXNode::GetLanguage() {
// If we have been labelled with language info then rely on that. // Walk up tree considering both detected and author declared languages.
const AXLanguageInfo* lang_info = GetLanguageInfo(); for (const AXNode* cur = this; cur; cur = cur->parent()) {
if (lang_info && !lang_info->language.empty()) // If language detection has assigned a language then we prefer that.
return lang_info->language; const AXLanguageInfo* lang_info = cur->GetLanguageInfo();
if (lang_info && !lang_info->language.empty()) {
// Otherwise fallback to kLanguage attribute. return lang_info->language;
const auto& lang_attr = }
GetInheritedStringAttribute(ax::mojom::StringAttribute::kLanguage);
return lang_attr; // If the page author has declared a language attribute we fallback to that.
const AXNodeData& data = cur->data();
if (data.HasStringAttribute(ax::mojom::StringAttribute::kLanguage)) {
return data.GetStringAttribute(ax::mojom::StringAttribute::kLanguage);
}
}
return base::EmptyString();
} }
std::ostream& operator<<(std::ostream& stream, const AXNode& node) { std::ostream& operator<<(std::ostream& stream, const AXNode& node) {
......
...@@ -356,19 +356,21 @@ class AX_EXPORT AXNode final { ...@@ -356,19 +356,21 @@ class AX_EXPORT AXNode final {
bool IsCellOrHeaderOfARIATable() const; bool IsCellOrHeaderOfARIATable() const;
bool IsCellOrHeaderOfARIAGrid() const; bool IsCellOrHeaderOfARIAGrid() const;
// Return an object containing information about the languages used. // Return an object containing information about the languages detected on
// this node.
// Callers should not retain this pointer, instead they should request it // Callers should not retain this pointer, instead they should request it
// every time it is needed. // every time it is needed.
// //
// Clients likely want to use GetLanguage instead.
//
// Returns nullptr if the node has no language info. // Returns nullptr if the node has no language info.
AXLanguageInfo* GetLanguageInfo(); AXLanguageInfo* GetLanguageInfo() const;
// This should only be called by the LabelLanguageForSubtree and is used as // This should only be called by LabelLanguageForSubtree and is used as part
// part of the language detection feature. // of the language detection feature.
void SetLanguageInfo(std::unique_ptr<AXLanguageInfo> lang_info); void SetLanguageInfo(std::unique_ptr<AXLanguageInfo> lang_info);
// Destroy the language info for this node.
void ClearLanguageInfo();
// Returns true if node has ignored state or ignored role. // Returns true if node has ignored state or ignored role.
bool IsIgnored() const; bool IsIgnored() const;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment