Unit test to cross check serialized data against complete data.

(1) Add an optional test to check serialized data against complete data. (2) Add a script to move data from a single arbitrary folder to the correct locations in Chromium. (3) Update data to the latest version. (4) Fix the serialized-data-to-C script so that the bytes-to-uint32 conversion is done manually. Bug: 850947 Change-Id: I84d5a6e34a99eee60c973525f2c8358c583d3a87 Reviewed-on: https://chromium-review.googlesource.com/c/1383351 Reviewed-by: anthonyvd <anthonyvd@chromium.org> Commit-Queue: Alexandre Frechette <frechette@chromium.org> Cr-Commit-Position: refs/heads/master@{#622095}

Unit test to cross check serialized data against complete data.
(1) Add an optional test to check serialized data against complete data. (2) Add a script to move data from a single arbitrary folder to the correct locations in Chromium. (3) Update data to the latest version. (4) Fix the serialized-data-to-C script so that the bytes-to-uint32 conversion is done manually. Bug: 850947 Change-Id: I84d5a6e34a99eee60c973525f2c8358c583d3a87 Reviewed-on: https://chromium-review.googlesource.com/c/1383351 Reviewed-by: anthonyvd <anthonyvd@chromium.org> Commit-Queue: Alexandre Frechette <frechette@chromium.org> Cr-Commit-Position: refs/heads/master@{#622095}
7fda0168 · Alexandre Frechette · Commit Bot · 561141eb · 7fda0168 · 7fda0168
Commit 7fda0168 authored Jan 11, 2019 by Alexandre Frechette Committed by Commit Bot Jan 11, 2019
6 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -171,6 +171,7 @@ vs-chromium-project.txt
 /components/resources/default_100_percent/google_chrome
 /components/resources/default_200_percent/google_chrome
 /components/search_engines/prepopulated_engines.xml
+/components/test/data/language/
 /components/suggestions.xml
 /components/variations.xml
 /components/zucchini/testdata/*.exe

--- a/components/language/content/browser/ulp_language_code_locator/BUILD.gn
+++ b/components/language/content/browser/ulp_language_code_locator/BUILD.gn
@@ -3,6 +3,7 @@
 # found in the LICENSE file.
 import("//build/config/compiler/compiler.gni")
+import("//testing/test.gni")
 action("ulp_serialized_to_static_c") {
  script = "ulp_serialized_to_static_c.py"
@@ -64,3 +65,20 @@ source_set("unit_tests") {
    "//third_party/s2cellid",
  ]
 }
+# Optional data test: builds ulp_language_code_locator_datatest.cc together
+# with the shared components test runner. The test reads
+# celltolang-data_rank0.csv from //components/test/data/language/, a directory
+# that is listed in .gitignore, so the data has to be copied there first.
+test("data_tests") {
+  sources = [
+    "//components/test/run_all_unittests.cc",
+    "ulp_language_code_locator_datatest.cc",
+  ]
+  deps = [
+    ":s2langquadtree",
+    ":ulp_language_code_locator",
+    "//base",
+    "//base/test:test_support",
+    "//components/test:test_support",
+    "//testing/gmock",
+    "//testing/gtest",
+    "//third_party/s2cellid",
+  ]
+}
--- a/components/language/content/browser/ulp_language_code_locator/geolanguage-data_rank0.txt
+++ b/components/language/content/browser/ulp_language_code_locator/geolanguage-data_rank0.txt
--- a/components/language/content/browser/ulp_language_code_locator/ulp_language_code_locator_datatest.cc
+++ b/components/language/content/browser/ulp_language_code_locator/ulp_language_code_locator_datatest.cc
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <map>
+#include <string>
+#include <vector>
+#include "base/files/file_util.h"
+#include "base/logging.h"
+#include "base/path_service.h"
+#include "base/strings/string_number_conversions.h"
+#include "base/strings/string_split.h"
+#include "components/language/content/browser/ulp_language_code_locator/s2langquadtree.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "third_party/s2cellid/src/s2/s2cellid.h"
+#include "third_party/s2cellid/src/s2/s2latlng.h"
+namespace language {
+namespace {
+#include "components/language/content/browser/ulp_language_code_locator/ulp_language_code_locator_helper.h"
+}  // namespace
+// Loads the reference table that maps a coordinate to its expected language.
+//
+// The table is read from
+// components/test/data/language/celltolang-data_rank0.csv under the source
+// root. Every non-empty line must hold exactly three comma-separated fields:
+// latitude in degrees, longitude in degrees, and the language string.
+//
+// Crashes (CHECK / LOG(FATAL)) when the source root cannot be resolved, the
+// file cannot be read, a line does not have three fields, or a coordinate is
+// not a valid double. If the same coordinate appears more than once, the last
+// line wins.
+//
+// Returned by non-const value so that callers can move from the result.
+std::map<S2LatLng, std::string> GetData() {
+  base::FilePath source_dir;
+  CHECK(base::PathService::Get(base::DIR_SOURCE_ROOT, &source_dir));
+  const base::FilePath data_filepath =
+      source_dir.AppendASCII("components/test/data/language/")
+          .AppendASCII("celltolang-data_rank0.csv");
+  std::string data;
+  if (!base::ReadFileToString(data_filepath, &data))
+    LOG(FATAL) << "Could not read data from `" << data_filepath << "`.";
+  const std::vector<std::string> lines = base::SplitString(
+      data, "\n", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
+  std::map<S2LatLng, std::string> latlng_to_lang;
+  for (const std::string& line : lines) {
+    // TODO(frechette) Remove once we ensured no empty line in data file.
+    if (line.empty())
+      continue;
+    const std::vector<std::string> fields = base::SplitString(
+        line, ",", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
+    CHECK_EQ(3u, fields.size());
+    // Initialized so the values are never indeterminate, even if the checks
+    // below are ever relaxed.
+    double lat = 0.0;
+    double lng = 0.0;
+    CHECK(base::StringToDouble(fields[0], &lat));
+    CHECK(base::StringToDouble(fields[1], &lng));
+    latlng_to_lang[S2LatLng::FromDegrees(lat, lng)] = fields[2];
+  }
+  return latlng_to_lang;
+}
+// Cross checks the serialized quad tree against the complete reference data:
+// every coordinate in the reference table must resolve, through the
+// deserialized tree, to the language recorded for it in the table.
+TEST(UlpLanguageCodeLocatorDataTest, TreeContainsData) {
+  const S2LangQuadTreeNode tree =
+      S2LangQuadTreeNode::Deserialize(GetLanguages(), GetTreeSerialized());
+  const std::map<S2LatLng, std::string> expected_by_latlng = GetData();
+  for (const auto& entry : expected_by_latlng) {
+    const S2LatLng& latlng = entry.first;
+    const std::string& expected_language = entry.second;
+    // Look the coordinate up through the S2 cell built from it.
+    S2CellId cell(latlng);
+    EXPECT_EQ(expected_language, tree.Get(cell));
+  }
+}
+}  // namespace language
--- a/components/language/content/browser/ulp_language_code_locator/ulp_serialized_to_static_c.py
+++ b/components/language/content/browser/ulp_language_code_locator/ulp_serialized_to_static_c.py
@@ -5,11 +5,8 @@
 """Generate c++ structure containing serialized ULP language quad tree"""
 import argparse
-import csv
 import os.path
-import string
 import sys
-import array
 sys.path.insert(1,
    os.path.join(os.path.dirname(__file__),
@@ -24,15 +21,21 @@ import jinja2 # pylint: disable=F0401
 def ReadSerializedData(input_path):
  """Read serialized ULP language quad tree"""
-  with open(input_path) as input_file:
+  with open(input_path, 'rb') as input_file:
    data = input_file.read()
  linebreak = data.index('\n')
  # First line is comma-separated list of languages.
  language_codes = data[:linebreak].strip().split(',')
-  # Rest of the file is the serialized tree. We read the bits as 32 bits,
+  # Rest of the file is the serialized tree.
-  # unsigned int words.
+  tree_bytes = data[linebreak+1:]
-  tree_serialized = array.array('I', data[linebreak+1:])
+  # Group the bytes into little-endian 32-bit integers, zero-padding the tail.
-  assert tree_serialized.itemsize == 4, "Items must be 4 bytes ints."
+  tree_serialized = [
+    sum((ord(tree_bytes[i+b]) << (8*b)) if i+b < len(tree_bytes) else 0
+    for b in xrange(4))
+    for i in xrange(0, len(tree_bytes), 4)
+  ]
  return tree_serialized, language_codes

--- a/components/language/content/browser/ulp_language_code_locator/update_data.sh
+++ b/components/language/content/browser/ulp_language_code_locator/update_data.sh
+#!/bin/bash
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# This script copies ULP language code locator data from a single folder
+# to the right location in the Chromium source.
+DIR=$1
+if [ ! -d $DIR ]; then
+  echo "First argument ${DIR} is not an existing directory."
+  return
+fi
+if [ ! -d components/test/data/language ]; then
+  echo "Making components/test/data/language"
+  mkdir components/test/data/language
+fi
+for i in `seq 0 2`;
+do
+  cp ${DIR}/geolanguage-data_rank$i.txt \
+    components/language/content/browser/ulp_language_code_locator/
+  cp ${DIR}/celltolang-data_rank$i.csv components/test/data/language/
+done