Commit 9e76b8ea authored by Joe DeBlasio, committed by Commit Bot

[Simplified domains] Update brand list and improve tooling.

This CL makes a small tweak to the common word list following
discussions. It also makes it slightly easier to import changes by
automating the regex generation step.

Fixed: 1117241
Change-Id: Ie57db990a8898f42a46f2f323897038595b83242
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2363307
Reviewed-by: Mustafa Emre Acer <meacer@chromium.org>
Commit-Queue: Joe DeBlasio <jdeblasio@chromium.org>
Cr-Commit-Position: refs/heads/master@{#799208}
parent 460dc52d
acer
adidas
amd
aol
asus
att
bbc
bloomberg
cbs
chevrolet
cisco
cnet
cnn
dell
deviantart
directv
disney
ebay
espn
expedia
fda
flickr
forbes
gamespot
garmin
google
harvard
hilton
honda
hyatt
ibm
ieee
ign
irs
kijiji
logitech
marriott
marvel
microsoft
mit
mlb
motorola
mozilla
msn
mtv
myspace
nasa
nba
nbc
ncaa
nfl
nhl
nike
nintendo
nokia
norton
nvidia
panasonic
paypal
pbs
playstation
reuters
samsung
sears
skype
sony
thumbzilla
toyota
tripadvisor
usc
usda
usps
verizon
wikipedia
xbox
xnxx
yahoo
zdnet
^acer$
^adidas$
^aol$
^asus$
^att$
^bbc$
^bloomberg$
^chevrolet$
^cisco$
^cnet$
^cnn$
^dell$
^deviantart$
^directv$
^disney$
^ebay$
^espn$
^expedia$
^flickr$
^forbes$
^gamespot$
^garmin$
^google$
^harvard$
^hilton$
^honda$
^hyatt$
^ibm$
^ieee$
^ign$
^irs$
^logitech$
^marriott$
^marvel$
^microsoft$
^mit$
^mlb$
^motorola$
^mozilla$
^msn$
^myspace$
^mysql$
^nasa$
^nba$
^nbc$
^ncaa$
^nfl$
^nhl$
^nike$
^nintendo$
^nokia$
^norton$
^nvidia$
^panasonic$
^paypal$
^playstation$
^reuters$
^samsung$
^sears$
^skype$
^sony$
^toyota$
^tripadvisor$
^verizon$
^wikipedia$
^xbox$
^xnxx$
^yahoo$
^zdnet$
...@@ -40,6 +40,7 @@ accessed, 0 ...@@ -40,6 +40,7 @@ accessed, 0
accessibility, 0 accessibility, 0
accessible, 0 accessible, 0
accessing, 0 accessing, 0
accession, 0
accessories, 0 accessories, 0
accessory, 0 accessory, 0
accident, 0 accident, 0
...@@ -110,6 +111,7 @@ acute, 0 ...@@ -110,6 +111,7 @@ acute, 0
ada, 0 ada, 0
adam, 0 adam, 0
adams, 0 adams, 0
adapt, 0
adaptation, 0 adaptation, 0
adapted, 0 adapted, 0
adapter, 0 adapter, 0
...@@ -327,7 +329,6 @@ ambassador, 0 ...@@ -327,7 +329,6 @@ ambassador, 0
amber, 0 amber, 0
ambien, 0 ambien, 0
ambient, 0 ambient, 0
amd, 0
amend, 0 amend, 0
amended, 0 amended, 0
amendment, 0 amendment, 0
...@@ -1388,6 +1389,7 @@ carolina, 0 ...@@ -1388,6 +1389,7 @@ carolina, 0
caroline, 0 caroline, 0
carpenter, 0 carpenter, 0
carpet, 0 carpet, 0
carriage, 0
carried, 0 carried, 0
carrier, 0 carrier, 0
carriers, 0 carriers, 0
...@@ -1440,7 +1442,6 @@ causing, 0 ...@@ -1440,7 +1442,6 @@ causing, 0
caution, 0 caution, 0
cave, 0 cave, 0
cayman, 0 cayman, 0
cbs, 0
ccd, 0 ccd, 0
cdna, 0 cdna, 0
cds, 0 cds, 0
...@@ -3330,6 +3331,7 @@ exterior, 0 ...@@ -3330,6 +3331,7 @@ exterior, 0
external, 0 external, 0
extra, 0 extra, 0
extract, 0 extract, 0
extracted, 0
extraction, 0 extraction, 0
extraordinary, 0 extraordinary, 0
extras, 0 extras, 0
...@@ -3417,7 +3419,6 @@ favourites, 0 ...@@ -3417,7 +3419,6 @@ favourites, 0
fax, 0 fax, 0
fbi, 0 fbi, 0
fcc, 0 fcc, 0
fda, 0
fear, 0 fear, 0
fears, 0 fears, 0
feat, 0 feat, 0
...@@ -4934,7 +4935,6 @@ kick, 0 ...@@ -4934,7 +4935,6 @@ kick, 0
kid, 0 kid, 0
kidney, 0 kidney, 0
kids, 0 kids, 0
kijiji, 0
kill, 0 kill, 0
killed, 0 killed, 0
killer, 0 killer, 0
...@@ -5865,7 +5865,6 @@ msgid, 0 ...@@ -5865,7 +5865,6 @@ msgid, 0
msgstr, 0 msgstr, 0
msie, 0 msie, 0
mst, 0 mst, 0
mtv, 0
much, 0 much, 0
mud, 0 mud, 0
mug, 0 mug, 0
...@@ -5898,6 +5897,7 @@ myers, 0 ...@@ -5898,6 +5897,7 @@ myers, 0
myrtle, 0 myrtle, 0
myself, 0 myself, 0
mysimon, 0 mysimon, 0
mysql, 0
mysterious, 0 mysterious, 0
mystery, 0 mystery, 0
myth, 0 myth, 0
...@@ -6467,6 +6467,7 @@ patriot, 0 ...@@ -6467,6 +6467,7 @@ patriot, 0
patrol, 0 patrol, 0
pattern, 0 pattern, 0
patterns, 0 patterns, 0
patterson, 0
paul, 0 paul, 0
pavilion, 0 pavilion, 0
paxil, 0 paxil, 0
...@@ -6478,7 +6479,6 @@ payment, 0 ...@@ -6478,7 +6479,6 @@ payment, 0
payments, 0 payments, 0
payroll, 0 payroll, 0
pays, 0 pays, 0
pbs, 0
pci, 0 pci, 0
pcs, 0 pcs, 0
pct, 0 pct, 0
...@@ -7397,6 +7397,7 @@ regularly, 0 ...@@ -7397,6 +7397,7 @@ regularly, 0
regulated, 0 regulated, 0
regulation, 0 regulation, 0
regulations, 0 regulations, 0
regulator, 0
regulatory, 0 regulatory, 0
rehab, 0 rehab, 0
rehabilitation, 0 rehabilitation, 0
...@@ -7621,6 +7622,7 @@ revolution, 0 ...@@ -7621,6 +7622,7 @@ revolution, 0
revolutionary, 0 revolutionary, 0
reward, 0 reward, 0
rewards, 0 rewards, 0
rex, 0
reynolds, 0 reynolds, 0
rfc, 0 rfc, 0
rfid, 0 rfid, 0
...@@ -8906,6 +8908,7 @@ tenure, 0 ...@@ -8906,6 +8908,7 @@ tenure, 0
term, 0 term, 0
terminal, 0 terminal, 0
terminals, 0 terminals, 0
terminate, 0
terminated, 0 terminated, 0
termination, 0 termination, 0
terminology, 0 terminology, 0
...@@ -8971,6 +8974,7 @@ there, 0 ...@@ -8971,6 +8974,7 @@ there, 0
thereafter, 0 thereafter, 0
thereby, 0 thereby, 0
therefore, 0 therefore, 0
therein, 0
thereof, 0 thereof, 0
thermal, 0 thermal, 0
thesaurus, 0 thesaurus, 0
...@@ -9028,7 +9032,6 @@ thumb, 0 ...@@ -9028,7 +9032,6 @@ thumb, 0
thumbnail, 0 thumbnail, 0
thumbnails, 0 thumbnails, 0
thumbs, 0 thumbs, 0
thumbzilla, 0
thunder, 0 thunder, 0
thursday, 0 thursday, 0
thus, 0 thus, 0
...@@ -9440,9 +9443,7 @@ urw, 0 ...@@ -9440,9 +9443,7 @@ urw, 0
usa, 0 usa, 0
usage, 0 usage, 0
usb, 0 usb, 0
usc, 0
usd, 0 usd, 0
usda, 0
use, 0 use, 0
used, 0 used, 0
useful, 0 useful, 0
...@@ -9454,7 +9455,6 @@ uses, 0 ...@@ -9454,7 +9455,6 @@ uses, 0
usgs, 0 usgs, 0
usher, 0 usher, 0
using, 0 using, 0
usps, 0
usr, 0 usr, 0
usual, 0 usual, 0
usually, 0 usually, 0
......
...@@ -4,8 +4,40 @@ ...@@ -4,8 +4,40 @@
# found in the LICENSE file.
#
# This script generates common_words.gperf. See README.md for more info.
# Builds the gperf word list: takes the first 10000 words longer than two
# characters from the full wordlist that are not brand names, sorts them, and
# writes them as "word, 0" entries between "%%" markers.
#
# Environment overrides (each falls back to the default shown below):
#   FULL_WORDLIST_PATH  wordlist to read; downloaded if it does not exist.
#   BRAND_WORDLIST      file with one brand name per line to exclude.
#   OUTPUT_PATH         where the generated gperf input is written.

# Where to download the full wordlist if needed.
FULL_WORDLIST_URL=https://norvig.com/ngrams/count_1w.txt
# Where the wordlist is, or should be, stored on disk.
FULL_WORDLIST_PATH=${FULL_WORDLIST_PATH:-count_1w.txt}
# Where the list of brands found in the common word list is found.
BRAND_WORDLIST=${BRAND_WORDLIST:-brands_in_common_words.list}
# Where to store the output file.
OUTPUT_PATH=${OUTPUT_PATH:-common_words.gperf}

# Abort on the first failing command. pipefail is deliberately not enabled:
# `head` stops reading early, which can make the upstream stages of the
# pipeline below terminate with SIGPIPE.
set -e

# Non-empty only when this run downloads the wordlist itself. Initialized
# explicitly so that a same-named variable inherited from the environment can
# never cause a caller-provided wordlist to be deleted.
USING_TEMPORARY_WORDLIST=
# Path of the temporary regex file; empty until it has been created.
REGEX_TMPFILE=

# Removes the files this run created. Installed as an EXIT trap so the
# temporary files are removed whether the script succeeds or a step fails.
cleanup() {
  if [ -n "$REGEX_TMPFILE" ]; then
    rm -f -- "$REGEX_TMPFILE"
  fi
  if [ -n "$USING_TEMPORARY_WORDLIST" ]; then
    rm -f -- "$FULL_WORDLIST_PATH"
  fi
}
trap cleanup EXIT

if [ ! -e "$FULL_WORDLIST_PATH" ]; then
  echo "= Fetching wordlist"
  # Flag before downloading so a failed or partial download is removed too,
  # instead of being picked up as a "provided" wordlist on the next run.
  USING_TEMPORARY_WORDLIST=1
  wget -q -O "$FULL_WORDLIST_PATH" "$FULL_WORDLIST_URL"
else
  echo "= Using provided wordlist"
fi

echo "= Generating regular expressions"
REGEX_TMPFILE=$(mktemp)
# Wrap every brand in ^...$ so grep only drops exact whole-line matches.
sed 's/^/^/; s/$/$/' "$BRAND_WORDLIST" > "$REGEX_TMPFILE"

echo "= Generating gperf list"
awk 'length($1) > 2 {print $1}' "$FULL_WORDLIST_PATH" \
  | grep -v -f "$REGEX_TMPFILE" \
  | head -n 10000 | sort \
  | awk 'BEGIN { print "%%" } { print $0", 0" } END { print "%%" }' \
  > "$OUTPUT_PATH"

# The actual removal happens in the EXIT trap; ending on this echo also lets
# the script exit with status 0 when the wordlist was provided by the caller.
echo "= Cleaning up"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment