Commit 4cac9b11 authored by sergeyv, committed by Commit bot

Revert of Add ETC1 powered SSE encoder for tile texture compression (patchset...

Revert of Add ETC1 powered SSE encoder for tile texture compression (patchset #23 id:440001 of https://codereview.chromium.org/1096703002/)

Reason for revert:
Speculative revert. Looks like this change breaks compilation on win8 GN:
http://build.chromium.org/p/chromium.win/builders/Win8%20GN

Failure Example:
http://build.chromium.org/p/chromium.win/builders/Win8%20GN/builds/7283

Original issue's description:
> Add ETC1 powered SSE encoder for tile texture compression
>
> Created an ETC1 encoder that uses SSE2 to improve compression speed.
> The SSE encoder extends TextureCompressor and uses the same algorithm as TextureCompressorETC1.
>
> Added unittest for TextureCompressorETC1.
>
> Moved shared code into an etc1 header.
>
> Added new performance test scenarios.
>
> Performance difference on Ubuntu x64, Haswell Processor:
> Without SSE:
> *RESULT Compress256x256BlackAndWhiteGradientImage: ETC1 Low= 1.966321587562561 us
> *RESULT Compress256x256SolidBlackImage: ETC1 Low= .0956009104847908 us
> *RESULT Compress256x256SolidColorImage: ETC1 Low= .4367307722568512 us
> *RESULT Compress256x256RandomColorImage: ETC1 Low= 5.948055744171143 us
>
> With SSE:
> *RESULT Compress256x256BlackAndWhiteGradientImage: ETC1 Low= 1.0316201448440552 us
> *RESULT Compress256x256SolidBlackImage: ETC1 Low= .25716209411621094 us
> *RESULT Compress256x256SolidColorImage: ETC1 Low= .2768038809299469 us
> *RESULT Compress256x256RandomColorImage: ETC1 Low= 1.834145426750183 us
>
> BUG=434699
> TEST=newly added unittest TextureCompressorETC1Test::Compress256x256CreateETC1, TextureCompressorETC1Test::Compress256x256RatioETC1
>
> Committed: https://crrev.com/5f3849aa8307399b7e6dfe5665ed149594244077
> Cr-Commit-Position: refs/heads/master@{#329840}

TBR=reveman@chromium.org,christiank@opera.com,jie.a.chen@intel.com,robert.bradford@intel.com,radu.velea@intel.com
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=434699

Review URL: https://codereview.chromium.org/1136083003

Cr-Commit-Position: refs/heads/master@{#329845}
parent 845f198c
@@ -421,7 +421,6 @@ Qiankun Miao <qiankun.miao@intel.com>
Qing Zhang <qing.zhang@intel.com>
Qi Yang <qi1988.yang@samsung.com>
Radu Stavila <stavila@adobe.com>
Radu Velea <radu.velea@intel.com>
Rafael Antognolli <rafael.antognolli@intel.com>
Raghavendra Ghatage <r.ghatage@samsung.com>
Rahul Gupta <rahul.g@samsung.com>
......
@@ -540,7 +540,6 @@ component("cc") {
deps = [
"//base",
"//base/third_party/dynamic_annotations",
"//cc:cc_opts",
"//cc/surfaces:surface_id",
"//gpu",
"//gpu/command_buffer/client:gles2_interface",
@@ -558,36 +557,6 @@ component("cc") {
}
}
source_set("cc_opts") {
public_deps = [
"//cc:cc_opts_sse",
]
}
source_set("cc_opts_sse") {
if (target_cpu == "ia32" || target_cpu == "x64") {
deps = [
"//base",
]
defines = [ "CC_IMPLEMENTATION=1" ]
if (!is_debug && (is_win || is_android)) {
configs -= [ "//build/config/compiler:optimize" ]
configs += [ "//build/config/compiler:optimize_max" ]
}
sources = [
"resources/texture_compressor.h",
"resources/texture_compressor_etc1.h",
"resources/texture_compressor_etc1_sse.cc",
"resources/texture_compressor_etc1_sse.h",
]
cflags = [ "-msse2" ]
}
}
source_set("test_support") {
testonly = true
sources = [
@@ -842,7 +811,6 @@ test("cc_unittests") {
"resources/scoped_gpu_raster_unittest.cc",
"resources/scoped_resource_unittest.cc",
"resources/task_graph_runner_unittest.cc",
"resources/texture_compressor_etc1_unittest.cc",
"resources/texture_mailbox_deleter_unittest.cc",
"resources/texture_uploader_unittest.cc",
"resources/tile_manager_unittest.cc",
......
@@ -20,7 +20,6 @@
'<(DEPTH)/ui/events/events.gyp:events_base',
'<(DEPTH)/ui/gfx/gfx.gyp:gfx',
'<(DEPTH)/ui/gfx/gfx.gyp:gfx_geometry',
'cc_opts',
],
'variables': {
'optimize': 'max',
@@ -633,41 +632,5 @@
'../build/android/increase_size_for_speed.gypi',
],
},
{
'target_name': 'cc_opts',
'type': 'static_library',
'conditions': [
['target_arch == "ia32" or target_arch == "x64"', {
'defines': [
'CC_IMPLEMENTATION=1',
],
'dependencies': [
'cc_opts_sse',
]
}],
],
},
{
'target_name': 'cc_opts_sse',
'type': 'static_library',
'dependencies': [
'<(DEPTH)/base/base.gyp:base',
],
'conditions': [
['target_arch == "ia32" or target_arch == "x64"', {
'defines': [
'CC_IMPLEMENTATION=1',
],
'sources': [
# Conditional compilation for SSE2 code on x86 and x64 machines
'resources/texture_compressor_etc1_sse.cc',
'resources/texture_compressor_etc1_sse.h',
],
'cflags': [
'-msse2',
],
}],
],
},
],
}
@@ -93,7 +93,6 @@
'resources/scoped_gpu_raster_unittest.cc',
'resources/scoped_resource_unittest.cc',
'resources/task_graph_runner_unittest.cc',
'resources/texture_compressor_etc1_unittest.cc',
'resources/texture_mailbox_deleter_unittest.cc',
'resources/texture_uploader_unittest.cc',
'resources/tile_manager_unittest.cc',
......
@@ -7,25 +7,13 @@
#include "base/logging.h"
#include "cc/resources/texture_compressor_etc1.h"
#if defined(ARCH_CPU_X86_FAMILY)
#include "base/cpu.h"
#include "cc/resources/texture_compressor_etc1_sse.h"
#endif
namespace cc {
scoped_ptr<TextureCompressor> TextureCompressor::Create(Format format) {
switch (format) {
case kFormatETC1: {
#if defined(ARCH_CPU_X86_FAMILY)
base::CPU cpu;
if (cpu.has_sse2()) {
return make_scoped_ptr(new TextureCompressorETC1SSE());
}
#endif
case kFormatETC1:
return make_scoped_ptr(new TextureCompressorETC1());
}
}
NOTREACHED();
return nullptr;
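// Illustrative usage sketch (editor's addition, not part of this CL): callers
// obtain an encoder through the factory above and never name the SSE class
// directly. Before the revert, an x86 CPU with SSE2 transparently received a
// TextureCompressorETC1SSE; after it, every caller gets the plain
// TextureCompressorETC1. The 256x256 size and the |src|/|dst| buffers are
// assumptions for the example only.
//
//   scoped_ptr<TextureCompressor> compressor =
//       TextureCompressor::Create(TextureCompressor::kFormatETC1);
//   // |src| is a 256x256 BGRA image; |dst| must hold the ETC1 output,
//   // i.e. (256 / 4) * (256 / 4) * 8 bytes.
//   compressor->Compress(src, dst, 256, 256, TextureCompressor::kQualityLow);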
......
@@ -18,11 +18,180 @@
// performance hit.
// #define USE_PERCEIVED_ERROR_METRIC
namespace cc {
namespace {
// Constructs a color from a given base color and luminance value.
template <typename T>
inline T clamp(T val, T min, T max) {
return val < min ? min : (val > max ? max : val);
}
inline uint8_t round_to_5_bits(float val) {
return clamp<uint8_t>(val * 31.0f / 255.0f + 0.5f, 0, 31);
}
inline uint8_t round_to_4_bits(float val) {
return clamp<uint8_t>(val * 15.0f / 255.0f + 0.5f, 0, 15);
}
union Color {
struct BgraColorType {
uint8_t b;
uint8_t g;
uint8_t r;
uint8_t a;
} channels;
uint8_t components[4];
uint32_t bits;
};
/*
* Codeword tables.
* See: Table 3.17.2
*/
static const int16_t g_codeword_tables[8][4] = {{-8, -2, 2, 8},
{-17, -5, 5, 17},
{-29, -9, 9, 29},
{-42, -13, 13, 42},
{-60, -18, 18, 60},
{-80, -24, 24, 80},
{-106, -33, 33, 106},
{-183, -47, 47, 183}};
/*
* Maps modifier indices to pixel index values.
* See: Table 3.17.3
*/
static const uint8_t g_mod_to_pix[4] = {3, 2, 0, 1};
/*
* The ETC1 specification indexes texels as follows:
*
* [a][e][i][m] [ 0][ 4][ 8][12]
* [b][f][j][n] <-> [ 1][ 5][ 9][13]
* [c][g][k][o] [ 2][ 6][10][14]
* [d][h][l][p] [ 3][ 7][11][15]
*
* However, when extracting sub blocks from BGRA data the natural array
* indexing order ends up different:
*
* vertical0: [a][e][b][f] horizontal0: [a][e][i][m]
* [c][g][d][h] [b][f][j][n]
* vertical1: [i][m][j][n] horizontal1: [c][g][k][o]
* [k][o][l][p] [d][h][l][p]
*
* In order to translate from the natural array indices in a sub-block to the
* indices (numbers) used by the specification and hardware, we use this table.
*/
static const uint8_t g_idx_to_num[4][8] = {
{0, 4, 1, 5, 2, 6, 3, 7}, // Vertical block 0.
{8, 12, 9, 13, 10, 14, 11, 15}, // Vertical block 1.
{0, 4, 8, 12, 1, 5, 9, 13}, // Horizontal block 0.
{2, 6, 10, 14, 3, 7, 11, 15} // Horizontal block 1.
};
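/*
* Worked example (editor's illustration, not part of the original CL): for
* vertical sub-block 0 the natural array order is [a][e][b][f][c][g][d][h],
* so natural index 2 refers to texel 'b', and g_idx_to_num[0][2] == 1, the
* number the specification assigns to 'b' in the 4x4 layout above.
*/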
inline void WriteColors444(uint8_t* block,
const Color& color0,
const Color& color1) {
block[0] = (color0.channels.r & 0xf0) | (color1.channels.r >> 4);
block[1] = (color0.channels.g & 0xf0) | (color1.channels.g >> 4);
block[2] = (color0.channels.b & 0xf0) | (color1.channels.b >> 4);
}
inline void WriteColors555(uint8_t* block,
const Color& color0,
const Color& color1) {
// Table for conversion to 3-bit two's complement format.
static const uint8_t two_compl_trans_table[8] = {
4, // -4 (100b)
5, // -3 (101b)
6, // -2 (110b)
7, // -1 (111b)
0, // 0 (000b)
1, // 1 (001b)
2, // 2 (010b)
3, // 3 (011b)
};
int16_t delta_r =
static_cast<int16_t>(color1.channels.r >> 3) - (color0.channels.r >> 3);
int16_t delta_g =
static_cast<int16_t>(color1.channels.g >> 3) - (color0.channels.g >> 3);
int16_t delta_b =
static_cast<int16_t>(color1.channels.b >> 3) - (color0.channels.b >> 3);
DCHECK(delta_r >= -4 && delta_r <= 3);
DCHECK(delta_g >= -4 && delta_g <= 3);
DCHECK(delta_b >= -4 && delta_b <= 3);
block[0] = (color0.channels.r & 0xf8) | two_compl_trans_table[delta_r + 4];
block[1] = (color0.channels.g & 0xf8) | two_compl_trans_table[delta_g + 4];
block[2] = (color0.channels.b & 0xf8) | two_compl_trans_table[delta_b + 4];
}
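/*
* Worked example for WriteColors555() (editor's illustration, not part of the
* original CL): with color0.channels.r == 0xF8 (248, 5-bit value 31) and
* color1.channels.r == 0xE0 (224, 5-bit value 28), delta_r is -3, which
* encodes as two_compl_trans_table[-3 + 4] == 5 (101b), so block[0] becomes
* (0xF8 & 0xf8) | 5 == 0xFD.
*/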
inline void WriteCodewordTable(uint8_t* block,
uint8_t sub_block_id,
uint8_t table) {
DCHECK_LT(sub_block_id, 2);
DCHECK_LT(table, 8);
uint8_t shift = (2 + (3 - sub_block_id * 3));
block[3] &= ~(0x07 << shift);
block[3] |= table << shift;
}
inline void WritePixelData(uint8_t* block, uint32_t pixel_data) {
block[4] |= pixel_data >> 24;
block[5] |= (pixel_data >> 16) & 0xff;
block[6] |= (pixel_data >> 8) & 0xff;
block[7] |= pixel_data & 0xff;
}
inline void WriteFlip(uint8_t* block, bool flip) {
block[3] &= ~0x01;
block[3] |= static_cast<uint8_t>(flip);
}
inline void WriteDiff(uint8_t* block, bool diff) {
block[3] &= ~0x02;
block[3] |= static_cast<uint8_t>(diff) << 1;
}
/**
* Compresses and rounds BGR888 into BGR444. The resulting BGR444 color is
* expanded to BGR888 as it would be in hardware after decompression. The
* actual 444 data is available in the four most significant bits of each
* channel.
*/
inline Color MakeColor444(const float* bgr) {
uint8_t b4 = round_to_4_bits(bgr[0]);
uint8_t g4 = round_to_4_bits(bgr[1]);
uint8_t r4 = round_to_4_bits(bgr[2]);
Color bgr444;
bgr444.channels.b = (b4 << 4) | b4;
bgr444.channels.g = (g4 << 4) | g4;
bgr444.channels.r = (r4 << 4) | r4;
return bgr444;
}
/**
* Compresses and rounds BGR888 into BGR555. The resulting BGR555 color is
* expanded to BGR888 as it would be in hardware after decompression. The
* actual 555 data is available in the five most significant bits of each
* channel.
*/
inline Color MakeColor555(const float* bgr) {
uint8_t b5 = round_to_5_bits(bgr[0]);
uint8_t g5 = round_to_5_bits(bgr[1]);
uint8_t r5 = round_to_5_bits(bgr[2]);
Color bgr555;
bgr555.channels.b = (b5 << 3) | (b5 >> 2);
bgr555.channels.g = (g5 << 3) | (g5 >> 2);
bgr555.channels.r = (r5 << 3) | (r5 >> 2);
return bgr555;
}
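/*
* Worked example (editor's illustration, not part of the original CL): a blue
* channel of 155 rounds to the 5-bit value 19 (155 * 31 / 255 + 0.5 ~= 19.3),
* which MakeColor555() expands back to (19 << 3) | (19 >> 2) == 152 | 4 ==
* 156, the value hardware would reconstruct after decompression.
*/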
/**
* Constructs a color from a given base color and luminance value.
*/
inline Color MakeColor(const Color& base, int16_t lum) {
int b = static_cast<int>(base.channels.b) + lum;
int g = static_cast<int>(base.channels.g) + lum;
@@ -34,8 +203,10 @@ inline Color MakeColor(const Color& base, int16_t lum) {
return color;
}
// Calculates the error metric for two colors. A small error signals that the
// colors are similar to each other; a large error signals the opposite.
/**
 * Calculates the error metric for two colors. A small error signals that the
 * colors are similar to each other; a large error signals the opposite.
 */
inline uint32_t GetColorError(const Color& u, const Color& v) {
#ifdef USE_PERCEIVED_ERROR_METRIC
float delta_b = static_cast<float>(u.channels.b) - v.channels.b;
@@ -290,15 +461,15 @@ void CompressBlock(uint8_t* dst, const Color* ver_src, const Color* hor_src) {
} // namespace
namespace cc {
void TextureCompressorETC1::Compress(const uint8_t* src,
uint8_t* dst,
int width,
int height,
Quality quality) {
DCHECK_GE(width, 4);
DCHECK_EQ((width & 3), 0);
DCHECK_GE(height, 4);
DCHECK_EQ((height & 3), 0);
DCHECK(width >= 4 && (width & 3) == 0);
DCHECK(height >= 4 && (height & 3) == 0);
Color ver_blocks[16];
Color hor_blocks[16];
......
@@ -7,183 +7,8 @@
#include "cc/resources/texture_compressor.h"
#include "base/compiler_specific.h"
#include "base/logging.h"
namespace cc {
template <typename T>
inline T clamp(T val, T min, T max) {
return val < min ? min : (val > max ? max : val);
}
inline uint8_t round_to_5_bits(float val) {
return clamp<uint8_t>(val * 31.0f / 255.0f + 0.5f, 0, 31);
}
inline uint8_t round_to_4_bits(float val) {
return clamp<uint8_t>(val * 15.0f / 255.0f + 0.5f, 0, 15);
}
union Color {
struct BgraColorType {
uint8_t b;
uint8_t g;
uint8_t r;
uint8_t a;
} channels;
uint8_t components[4];
uint32_t bits;
};
// Codeword tables.
// See: Table 3.17.2
ALIGNAS(16) static const int16_t g_codeword_tables[8][4] = {
{-8, -2, 2, 8},
{-17, -5, 5, 17},
{-29, -9, 9, 29},
{-42, -13, 13, 42},
{-60, -18, 18, 60},
{-80, -24, 24, 80},
{-106, -33, 33, 106},
{-183, -47, 47, 183}};
// Maps modifier indices to pixel index values.
// See: Table 3.17.3
static const uint8_t g_mod_to_pix[4] = {3, 2, 0, 1};
// The ETC1 specification indexes texels as follows:
// [a][e][i][m] [ 0][ 4][ 8][12]
// [b][f][j][n] <-> [ 1][ 5][ 9][13]
// [c][g][k][o] [ 2][ 6][10][14]
// [d][h][l][p] [ 3][ 7][11][15]
// [ 0][ 1][ 2][ 3] [ 0][ 1][ 4][ 5]
// [ 4][ 5][ 6][ 7] <-> [ 8][ 9][12][13]
// [ 8][ 9][10][11] [ 2][ 3][ 6][ 7]
// [12][13][14][15] [10][11][14][15]
// However, when extracting sub blocks from BGRA data the natural array
// indexing order ends up different:
// vertical0: [a][e][b][f] horizontal0: [a][e][i][m]
// [c][g][d][h] [b][f][j][n]
// vertical1: [i][m][j][n] horizontal1: [c][g][k][o]
// [k][o][l][p] [d][h][l][p]
// In order to translate from the natural array indices in a sub-block to the
// indices (numbers) used by the specification and hardware, we use this table.
static const uint8_t g_idx_to_num[4][8] = {
{0, 4, 1, 5, 2, 6, 3, 7}, // Vertical block 0.
{8, 12, 9, 13, 10, 14, 11, 15}, // Vertical block 1.
{0, 4, 8, 12, 1, 5, 9, 13}, // Horizontal block 0.
{2, 6, 10, 14, 3, 7, 11, 15} // Horizontal block 1.
};
inline void WriteColors444(uint8_t* block,
const Color& color0,
const Color& color1) {
// Write output color for BGRA textures.
block[0] = (color0.channels.r & 0xf0) | (color1.channels.r >> 4);
block[1] = (color0.channels.g & 0xf0) | (color1.channels.g >> 4);
block[2] = (color0.channels.b & 0xf0) | (color1.channels.b >> 4);
}
inline void WriteColors555(uint8_t* block,
const Color& color0,
const Color& color1) {
// Table for conversion to 3-bit two's complement format.
static const uint8_t two_compl_trans_table[8] = {
4, // -4 (100b)
5, // -3 (101b)
6, // -2 (110b)
7, // -1 (111b)
0, // 0 (000b)
1, // 1 (001b)
2, // 2 (010b)
3, // 3 (011b)
};
int16_t delta_r =
static_cast<int16_t>(color1.channels.r >> 3) - (color0.channels.r >> 3);
int16_t delta_g =
static_cast<int16_t>(color1.channels.g >> 3) - (color0.channels.g >> 3);
int16_t delta_b =
static_cast<int16_t>(color1.channels.b >> 3) - (color0.channels.b >> 3);
DCHECK_GE(delta_r, -4);
DCHECK_LE(delta_r, 3);
DCHECK_GE(delta_g, -4);
DCHECK_LE(delta_g, 3);
DCHECK_GE(delta_b, -4);
DCHECK_LE(delta_b, 3);
// Write output color for BGRA textures.
block[0] = (color0.channels.r & 0xf8) | two_compl_trans_table[delta_r + 4];
block[1] = (color0.channels.g & 0xf8) | two_compl_trans_table[delta_g + 4];
block[2] = (color0.channels.b & 0xf8) | two_compl_trans_table[delta_b + 4];
}
inline void WriteCodewordTable(uint8_t* block,
uint8_t sub_block_id,
uint8_t table) {
DCHECK_LT(sub_block_id, 2);
DCHECK_LT(table, 8);
uint8_t shift = (2 + (3 - sub_block_id * 3));
block[3] &= ~(0x07 << shift);
block[3] |= table << shift;
}
inline void WritePixelData(uint8_t* block, uint32_t pixel_data) {
block[4] |= pixel_data >> 24;
block[5] |= (pixel_data >> 16) & 0xff;
block[6] |= (pixel_data >> 8) & 0xff;
block[7] |= pixel_data & 0xff;
}
inline void WriteFlip(uint8_t* block, bool flip) {
block[3] &= ~0x01;
block[3] |= static_cast<uint8_t>(flip);
}
inline void WriteDiff(uint8_t* block, bool diff) {
block[3] &= ~0x02;
block[3] |= static_cast<uint8_t>(diff) << 1;
}
// Compresses and rounds BGR888 into BGR444. The resulting BGR444 color is
// expanded to BGR888 as it would be in hardware after decompression. The
// actual 444 data is available in the four most significant bits of each
// channel.
inline Color MakeColor444(const float* bgr) {
uint8_t b4 = round_to_4_bits(bgr[0]);
uint8_t g4 = round_to_4_bits(bgr[1]);
uint8_t r4 = round_to_4_bits(bgr[2]);
Color bgr444;
bgr444.channels.b = (b4 << 4) | b4;
bgr444.channels.g = (g4 << 4) | g4;
bgr444.channels.r = (r4 << 4) | r4;
// Added to distinguish between expanded 555 and 444 colors.
bgr444.channels.a = 0x44;
return bgr444;
}
// Compresses and rounds BGR888 into BGR555. The resulting BGR555 color is
// expanded to BGR888 as it would be in hardware after decompression. The
// actual 555 data is available in the five most significant bits of each
// channel.
inline Color MakeColor555(const float* bgr) {
uint8_t b5 = round_to_5_bits(bgr[0]);
uint8_t g5 = round_to_5_bits(bgr[1]);
uint8_t r5 = round_to_5_bits(bgr[2]);
Color bgr555;
bgr555.channels.b = (b5 << 3) | (b5 >> 2);
bgr555.channels.g = (g5 << 3) | (g5 >> 2);
bgr555.channels.r = (r5 << 3) | (r5 >> 2);
// Added to distinguish between expanded 555 and 444 colors.
bgr555.channels.a = 0x55;
return bgr555;
}
class CC_EXPORT TextureCompressorETC1 : public TextureCompressor {
public:
TextureCompressorETC1() {}
......
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "cc/resources/texture_compressor_etc1_sse.h"
#include <emmintrin.h>
#include "base/compiler_specific.h"
#include "base/logging.h"
// Using this header for common functions such as Color handling
// and codeword table.
#include "cc/resources/texture_compressor_etc1.h"
namespace cc {
namespace {
inline uint32_t SetETC1MaxError(uint32_t avg_error) {
// ETC1 codeword table is sorted in ascending order.
// Our algorithm will try to identify the index that generates the minimum
// error.
// The min error calculated during ComputeLuminance main loop will converge
// towards that value.
// We use this threshold to determine when it doesn't make sense to iterate
// further through the array.
return avg_error + avg_error / 2 + 384;
}
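// Worked example for SetETC1MaxError() (editor's illustration, not part of
// the original CL): for an average sub-block error of 1000 the cut-off is
// 1000 + 1000 / 2 + 384 = 1884; once the per-table error in ComputeLuminance
// is growing and exceeds this value, the remaining codeword tables are
// skipped.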
struct __sse_data {
// This is used to store raw data.
uint8_t* block;
// This is used to store 8 bit packed values.
__m128i* packed;
// This is used to store 32 bit zero extended values into 4x4 arrays.
__m128i* blue;
__m128i* green;
__m128i* red;
};
// Commonly used registers throughout the code.
static const __m128i __sse_zero = _mm_set1_epi32(0);
static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);
inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
static const __m128i color_max = _mm_set1_epi32(0xFF);
return _mm_max_epi16(__sse_zero,
_mm_min_epi16(_mm_add_epi16(x, y), color_max));
}
inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
// Changed from _mm_mullo_epi32 (SSE4) to _mm_mullo_epi16 (SSE2).
__m128i ret = _mm_sub_epi16(x, y);
return _mm_mullo_epi16(ret, ret);
}
inline __m128i AddChannelError(const __m128i x,
const __m128i y,
const __m128i z) {
return _mm_add_epi32(x, _mm_add_epi32(y, z));
}
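// Horizontal sum of the four 32-bit lanes of |x| (descriptive comment added
// by the editor): the 0x4E shuffle swaps the 64-bit halves and the 0xB1
// shuffle swaps adjacent 32-bit lanes, so after the two adds every lane holds
// the total; the first lane is then extracted.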
inline uint32_t SumSSE(const __m128i x) {
__m128i sum = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0x4E));
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));
return _mm_cvtsi128_si32(sum);
}
inline uint32_t GetVerticalError(const __sse_data* data,
const __m128i* blue_avg,
const __m128i* green_avg,
const __m128i* red_avg,
uint32_t* verror) {
__m128i error = __sse_zero;
for (int i = 0; i < 4; i++) {
error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));
error =
_mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));
error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));
}
error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
verror[0] = _mm_cvtsi128_si32(error);
verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));
return verror[0] + verror[1];
}
inline uint32_t GetHorizontalError(const __sse_data* data,
const __m128i* blue_avg,
const __m128i* green_avg,
const __m128i* red_avg,
uint32_t* verror) {
__m128i error = __sse_zero;
int first_index, second_index;
for (int i = 0; i < 2; i++) {
first_index = 2 * i;
second_index = first_index + 1;
error = _mm_add_epi32(
error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));
error = _mm_add_epi32(
error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));
error = _mm_add_epi32(
error, GetColorErrorSSE(data->green[first_index], green_avg[i]));
error = _mm_add_epi32(
error, GetColorErrorSSE(data->green[second_index], green_avg[i]));
error = _mm_add_epi32(error,
GetColorErrorSSE(data->red[first_index], red_avg[i]));
error = _mm_add_epi32(
error, GetColorErrorSSE(data->red[second_index], red_avg[i]));
}
error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
verror[0] = _mm_cvtsi128_si32(error);
verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));
return verror[0] + verror[1];
}
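// Computes the average color of each candidate sub-block (descriptive comment
// added by the editor): output[0..5] holds the BGR averages of the two
// vertical sub-blocks and output[6..11] those of the two horizontal
// sub-blocks, while __sse_use_diff reports, per split direction, whether the
// two averages are close enough (5-bit deltas in [-4, 3]) to use the 555
// differential mode.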
inline void GetAvgColors(const __sse_data* data,
float* output,
bool* __sse_use_diff) {
__m128i sum[2], tmp;
// TODO(radu.velea): _mm_avg_epu8 on packed data maybe.
// Compute avg red value.
// [S0 S0 S1 S1]
sum[0] = _mm_add_epi32(data->red[0], data->red[1]);
sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
// [S2 S2 S3 S3]
sum[1] = _mm_add_epi32(data->red[2], data->red[3]);
sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
float hred[2], vred[2];
hred[0] = (_mm_cvtsi128_si32(
_mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
8.0f;
hred[1] = (_mm_cvtsi128_si32(
_mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
8.0f;
tmp = _mm_add_epi32(sum[0], sum[1]);
vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
// Compute avg green value.
// [S0 S0 S1 S1]
sum[0] = _mm_add_epi32(data->green[0], data->green[1]);
sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
// [S2 S2 S3 S3]
sum[1] = _mm_add_epi32(data->green[2], data->green[3]);
sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
float hgreen[2], vgreen[2];
hgreen[0] = (_mm_cvtsi128_si32(
_mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
8.0f;
hgreen[1] = (_mm_cvtsi128_si32(
_mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
8.0f;
tmp = _mm_add_epi32(sum[0], sum[1]);
vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
// Compute avg blue value.
// [S0 S0 S1 S1]
sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);
sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
// [S2 S2 S3 S3]
sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);
sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
float hblue[2], vblue[2];
hblue[0] = (_mm_cvtsi128_si32(
_mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
8.0f;
hblue[1] = (_mm_cvtsi128_si32(
_mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
8.0f;
tmp = _mm_add_epi32(sum[0], sum[1]);
vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
// TODO(radu.velea): Return int's instead of floats, based on Quality.
output[0] = vblue[0];
output[1] = vgreen[0];
output[2] = vred[0];
output[3] = vblue[1];
output[4] = vgreen[1];
output[5] = vred[1];
output[6] = hblue[0];
output[7] = hgreen[0];
output[8] = hred[0];
output[9] = hblue[1];
output[10] = hgreen[1];
output[11] = hred[1];
__m128i threshold_upper = _mm_set1_epi32(3);
__m128i threshold_lower = _mm_set1_epi32(-4);
__m128 factor_v = _mm_set1_ps(31.0f / 255.0f);
__m128 rounding_v = _mm_set1_ps(0.5f);
__m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);
__m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);
__m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);
__m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);
h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);
h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);
v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);
v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);
h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);
h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);
v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);
v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);
__m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);
__m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);
__m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);
__m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);
h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);
v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);
__sse_use_diff[0] =
(0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threshold_lower)));
__sse_use_diff[0] &=
(0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threshold_upper)));
__sse_use_diff[1] =
(0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threshold_lower)));
__sse_use_diff[1] &=
(0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threshold_upper)));
}
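// Picks, for one 8-texel sub-block, the codeword table and per-texel modifier
// indices that minimize the summed squared channel error against |base|, then
// writes the table selection and pixel index bits into |block| (descriptive
// comment added by the editor).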
void ComputeLuminance(uint8_t* block,
const Color& base,
const int sub_block_id,
const uint8_t* idx_to_num_tab,
const __sse_data* data,
const uint32_t expected_error) {
uint8_t best_tbl_idx = 0;
uint32_t best_error = 0x7FFFFFFF;
uint8_t best_mod_idx[8][8]; // [table][texel]
const __m128i base_blue = _mm_set1_epi32(base.channels.b);
const __m128i base_green = _mm_set1_epi32(base.channels.g);
const __m128i base_red = _mm_set1_epi32(base.channels.r);
__m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;
__m128i block_error, mask;
// This will have the minimum errors for each 4 pixels.
__m128i first_half_min;
__m128i second_half_min;
// This will have the matching table index combo for each 4 pixels.
__m128i first_half_pattern;
__m128i second_half_pattern;
const __m128i first_blue_data_block = data->blue[2 * sub_block_id];
const __m128i first_green_data_block = data->green[2 * sub_block_id];
const __m128i first_red_data_block = data->red[2 * sub_block_id];
const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];
const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];
const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];
uint32_t min;
// Fail early to increase speed.
long delta = INT32_MAX;
uint32_t last_min = INT32_MAX;
const uint8_t shuffle_mask[] = {
0x1B, 0x4E, 0xB1, 0xE4}; // Important they are sorted ascending.
for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
tmp = _mm_set_epi32(
g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],
g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);
test_blue = AddAndClamp(tmp, base_blue);
test_green = AddAndClamp(tmp, base_green);
test_red = AddAndClamp(tmp, base_red);
first_half_min = __sse_max_int;
second_half_min = __sse_max_int;
first_half_pattern = __sse_zero;
second_half_pattern = __sse_zero;
for (uint8_t imm8 : shuffle_mask) {
switch (imm8) {
case 0x1B:
tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);
tmp_green = _mm_shuffle_epi32(test_green, 0x1B);
tmp_red = _mm_shuffle_epi32(test_red, 0x1B);
break;
case 0x4E:
tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);
tmp_green = _mm_shuffle_epi32(test_green, 0x4E);
tmp_red = _mm_shuffle_epi32(test_red, 0x4E);
break;
case 0xB1:
tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);
tmp_green = _mm_shuffle_epi32(test_green, 0xB1);
tmp_red = _mm_shuffle_epi32(test_red, 0xB1);
break;
case 0xE4:
tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);
tmp_green = _mm_shuffle_epi32(test_green, 0xE4);
tmp_red = _mm_shuffle_epi32(test_red, 0xE4);
break;
default:
tmp_blue = test_blue;
tmp_green = test_green;
tmp_red = test_red;
}
tmp = _mm_set1_epi32(imm8);
block_error =
AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),
GetColorErrorSSE(tmp_green, first_green_data_block),
GetColorErrorSSE(tmp_red, first_red_data_block));
// Save winning pattern.
first_half_pattern = _mm_max_epi16(
first_half_pattern,
_mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));
// Should use _mm_min_epi32(first_half_min, block_error); from SSE4
// otherwise we have a small performance penalty.
mask = _mm_cmplt_epi32(block_error, first_half_min);
first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
_mm_andnot_si128(mask, first_half_min));
// Compute second part of the block.
block_error =
AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),
GetColorErrorSSE(tmp_green, second_green_data_block),
GetColorErrorSSE(tmp_red, second_red_data_block));
// Save winning pattern.
second_half_pattern = _mm_max_epi16(
second_half_pattern,
_mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));
// Should use _mm_min_epi32(second_half_min, block_error); from SSE4
// otherwise we have a small performance penalty.
mask = _mm_cmplt_epi32(block_error, second_half_min);
second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
_mm_andnot_si128(mask, second_half_min));
}
first_half_min = _mm_add_epi32(first_half_min, second_half_min);
first_half_min =
_mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));
first_half_min =
_mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));
min = _mm_cvtsi128_si32(first_half_min);
delta = min - last_min;
last_min = min;
if (min < best_error) {
best_tbl_idx = tbl_idx;
best_error = min;
best_mod_idx[tbl_idx][0] =
(_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;
best_mod_idx[tbl_idx][4] =
(_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;
best_mod_idx[tbl_idx][1] =
(_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>
(2)) &
3;
best_mod_idx[tbl_idx][5] =
(_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>
(2)) &
3;
best_mod_idx[tbl_idx][2] =
(_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>
(4)) &
3;
best_mod_idx[tbl_idx][6] =
(_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>
(4)) &
3;
best_mod_idx[tbl_idx][3] =
(_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>
(6)) &
3;
best_mod_idx[tbl_idx][7] =
(_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>
(6)) &
3;
if (best_error == 0) {
break;
}
} else if (delta > 0 && expected_error < min) {
// The error is growing and is well beyond expected threshold.
break;
}
}
WriteCodewordTable(block, sub_block_id, best_tbl_idx);
uint32_t pix_data = 0;
uint8_t mod_idx;
uint8_t pix_idx;
uint32_t lsb;
uint32_t msb;
int texel_num;
for (unsigned int i = 0; i < 8; ++i) {
mod_idx = best_mod_idx[best_tbl_idx][i];
pix_idx = g_mod_to_pix[mod_idx];
lsb = pix_idx & 0x1;
msb = pix_idx >> 1;
// Obtain the texel number as specified in the standard.
texel_num = idx_to_num_tab[i];
pix_data |= msb << (texel_num + 16);
pix_data |= lsb << (texel_num);
}
WritePixelData(block, pix_data);
}
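// Encodes one 4x4 block (descriptive comment added by the editor): computes
// the average colors of the candidate sub-blocks, chooses between vertical
// and horizontal splitting and between the 555 differential and 444
// individual modes, writes the header bits, and then resolves the luminance
// modifiers of each sub-block via ComputeLuminance().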
void CompressBlock(uint8_t* dst, __sse_data* data) {
// The first 3 values are for vertical 1, the second 3 for vertical 2, the
// third 3 for horizontal 1, and the last 3 for horizontal 2.
float __sse_avg_colors[12] = {
0,
};
bool use_differential[2] = {true, true};
GetAvgColors(data, __sse_avg_colors, use_differential);
Color sub_block_avg[4];
// TODO(radu.velea): Remove floating point operations and use only int's +
// normal rounding and shifts for reduced Quality.
for (int i = 0, j = 1; i < 4; i += 2, j += 2) {
if (use_differential[i / 2] == false) {
sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);
sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);
} else {
sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);
sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);
}
}
__m128i red_avg[2], green_avg[2], blue_avg[2];
// TODO(radu.velea): Perfect accuracy, maybe skip floating variables.
blue_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[3]),
static_cast<int>(__sse_avg_colors[3]),
static_cast<int>(__sse_avg_colors[0]),
static_cast<int>(__sse_avg_colors[0]));
green_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[4]),
static_cast<int>(__sse_avg_colors[4]),
static_cast<int>(__sse_avg_colors[1]),
static_cast<int>(__sse_avg_colors[1]));
red_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[5]),
static_cast<int>(__sse_avg_colors[5]),
static_cast<int>(__sse_avg_colors[2]),
static_cast<int>(__sse_avg_colors[2]));
uint32_t vertical_error[2];
GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);
// TODO(radu.velea): Perfect accuracy, maybe skip floating variables.
blue_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[6]));
blue_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[9]));
green_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[7]));
green_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[10]));
red_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[8]));
red_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[11]));
uint32_t horizontal_error[2];
GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);
bool flip = (horizontal_error[0] + horizontal_error[1]) <
(vertical_error[0] + vertical_error[1]);
uint32_t* expected_errors = flip ? horizontal_error : vertical_error;
// Clear destination buffer so that we can "or" in the results.
memset(dst, 0, 8);
WriteDiff(dst, use_differential[!!flip]);
WriteFlip(dst, flip);
uint8_t sub_block_off_0 = flip ? 2 : 0;
uint8_t sub_block_off_1 = sub_block_off_0 + 1;
if (use_differential[!!flip]) {
WriteColors555(dst, sub_block_avg[sub_block_off_0],
sub_block_avg[sub_block_off_1]);
} else {
WriteColors444(dst, sub_block_avg[sub_block_off_0],
sub_block_avg[sub_block_off_1]);
}
if (!flip) {
// Transpose vertical data into horizontal lines.
__m128i tmp;
for (int i = 0; i < 4; i += 2) {
tmp = data->blue[i];
data->blue[i] = _mm_add_epi32(
_mm_move_epi64(data->blue[i]),
_mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));
data->blue[i + 1] = _mm_add_epi32(
_mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
_mm_shuffle_epi32(
_mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),
0x4E));
tmp = data->green[i];
data->green[i] = _mm_add_epi32(
_mm_move_epi64(data->green[i]),
_mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));
data->green[i + 1] = _mm_add_epi32(
_mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
_mm_shuffle_epi32(
_mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),
0x4E));
tmp = data->red[i];
data->red[i] = _mm_add_epi32(
_mm_move_epi64(data->red[i]),
_mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));
data->red[i + 1] = _mm_add_epi32(
_mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
_mm_shuffle_epi32(
_mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));
}
tmp = data->blue[1];
data->blue[1] = data->blue[2];
data->blue[2] = tmp;
tmp = data->green[1];
data->green[1] = data->green[2];
data->green[2] = tmp;
tmp = data->red[1];
data->red[1] = data->red[2];
data->red[2] = tmp;
}
// Compute luminance for the first sub block.
ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,
g_idx_to_num[sub_block_off_0], data,
SetETC1MaxError(expected_errors[0]));
// Compute luminance for the second sub block.
ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,
g_idx_to_num[sub_block_off_1], data,
SetETC1MaxError(expected_errors[1]));
}
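// Copies a 4x4 block of 32-bit pixels out of a |width|-pixel-wide image into
// a contiguous 64-byte buffer (descriptive comment added by the editor).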
static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
for (int j = 0; j < 4; ++j) {
memcpy(&dst[j * 4 * 4], src, 4 * 4);
src += width * 4;
}
}
inline bool TransposeBlock(uint8_t* block, __m128i* transposed) {
// This function transforms an incoming block of RGBA or BGRA pixels into 4
// registers, each containing the data corresponding to a single channel.
// Ex: transposed[0] will have all the R values for an RGBA block,
// transposed[1] will have G, etc.
// The values are packed as 8 bit unsigned values in the SSE registers.
// Before doing any work we check if the block is solid.
__m128i tmp3, tmp2, tmp1, tmp0;
__m128i test_solid = _mm_set1_epi32(*((uint32_t*)block));
uint16_t mask = 0xFFFF;
// a0,a1,a2,...a7, ...a15
transposed[0] = _mm_loadu_si128((__m128i*)(block));
// b0, b1,b2,...b7.... b15
transposed[1] = _mm_loadu_si128((__m128i*)(block + 16));
// c0, c1,c2,...c7....c15
transposed[2] = _mm_loadu_si128((__m128i*)(block + 32));
// d0,d1,d2,...d7....d15
transposed[3] = _mm_loadu_si128((__m128i*)(block + 48));
for (int i = 0; i < 4; i++) {
mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));
}
if (mask == 0xFFFF) {
// Block is solid, no need to do any more work.
return false;
}
// a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
tmp0 = _mm_unpacklo_epi8(transposed[0], transposed[1]);
// c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
tmp1 = _mm_unpacklo_epi8(transposed[2], transposed[3]);
// a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
tmp2 = _mm_unpackhi_epi8(transposed[0], transposed[1]);
// c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
tmp3 = _mm_unpackhi_epi8(transposed[2], transposed[3]);
// a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
// a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
// c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
// c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);
// a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);
// a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);
// a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13
tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);
// a6,a14, b6,b14, c6,c14, d6,d14, a7,a15, b7,b15, c7,c15, d7,d15
tmp3 = _mm_unpackhi_epi32(transposed[1], transposed[3]);
// a0,a4, a8,a12, b0,b4, b8,b12, c0,c4, c8,c12, d0,d4, d8,d12
transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
// a1,a5, a9,a13, b1,b5, b9,b13, c1,c5, c9,c13, d1,d5, d9,d13
transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
// a2,a6, a10,a14, b2,b6, b10,b14, c2,c6, c10,c14, d2,d6, d10,d14
transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
// a3,a7, a11,a15, b3,b7, b11,b15, c3,c7, c11,c15, d3,d7, d11,d15
transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);
return true;
}
inline void UnpackBlock(__m128i* packed,
__m128i* red,
__m128i* green,
__m128i* blue,
__m128i* alpha) {
const __m128i zero = _mm_set1_epi8(0);
__m128i tmp_low, tmp_high;
// Unpack red.
tmp_low = _mm_unpacklo_epi8(packed[0], zero);
tmp_high = _mm_unpackhi_epi8(packed[0], zero);
red[0] = _mm_unpacklo_epi16(tmp_low, zero);
red[1] = _mm_unpackhi_epi16(tmp_low, zero);
red[2] = _mm_unpacklo_epi16(tmp_high, zero);
red[3] = _mm_unpackhi_epi16(tmp_high, zero);
// Unpack green.
tmp_low = _mm_unpacklo_epi8(packed[1], zero);
tmp_high = _mm_unpackhi_epi8(packed[1], zero);
green[0] = _mm_unpacklo_epi16(tmp_low, zero);
green[1] = _mm_unpackhi_epi16(tmp_low, zero);
green[2] = _mm_unpacklo_epi16(tmp_high, zero);
green[3] = _mm_unpackhi_epi16(tmp_high, zero);
// Unpack blue.
tmp_low = _mm_unpacklo_epi8(packed[2], zero);
tmp_high = _mm_unpackhi_epi8(packed[2], zero);
blue[0] = _mm_unpacklo_epi16(tmp_low, zero);
blue[1] = _mm_unpackhi_epi16(tmp_low, zero);
blue[2] = _mm_unpacklo_epi16(tmp_high, zero);
blue[3] = _mm_unpackhi_epi16(tmp_high, zero);
// Unpack alpha - unused for ETC1.
tmp_low = _mm_unpacklo_epi8(packed[3], zero);
tmp_high = _mm_unpackhi_epi8(packed[3], zero);
alpha[0] = _mm_unpacklo_epi16(tmp_low, zero);
alpha[1] = _mm_unpackhi_epi16(tmp_low, zero);
alpha[2] = _mm_unpacklo_epi16(tmp_high, zero);
alpha[3] = _mm_unpackhi_epi16(tmp_high, zero);
}
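// Fast path for solid-color blocks (descriptive comment added by the editor):
// encodes the single base color in 555 differential mode with a zero delta,
// searches all 8 codeword tables and 4 modifiers for the luminance offset
// that best reconstructs the original color, and replicates that choice
// across all 16 texels.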
inline void CompressSolid(uint8_t* dst, uint8_t* block) {
// Clear destination buffer so that we can "or" in the results.
memset(dst, 0, 8);
const float src_color_float[3] = {static_cast<float>(block[0]),
static_cast<float>(block[1]),
static_cast<float>(block[2])};
const Color base = MakeColor555(src_color_float);
const __m128i base_v =
_mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);
const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);
__m128i lum;
__m128i colors[4];
static const __m128i rgb =
_mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
WriteDiff(dst, true);
WriteFlip(dst, false);
WriteColors555(dst, base, base);
uint8_t best_tbl_idx = 0;
uint8_t best_mod_idx = 0;
uint32_t best_mod_err = INT32_MAX;
for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
lum = _mm_set_epi32(
g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],
g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);
colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));
colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));
colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));
colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));
for (int i = 0; i < 4; i++) {
uint32_t mod_err =
SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));
colors[i] = _mm_and_si128(colors[i], rgb);
if (mod_err < best_mod_err) {
best_tbl_idx = tbl_idx;
best_mod_idx = i;
best_mod_err = mod_err;
if (mod_err == 0) {
break; // We cannot do any better than this.
}
}
}
}
WriteCodewordTable(dst, 0, best_tbl_idx);
WriteCodewordTable(dst, 1, best_tbl_idx);
uint8_t pix_idx = g_mod_to_pix[best_mod_idx];
uint32_t lsb = pix_idx & 0x1;
uint32_t msb = pix_idx >> 1;
uint32_t pix_data = 0;
for (unsigned int i = 0; i < 2; ++i) {
for (unsigned int j = 0; j < 8; ++j) {
// Obtain the texel number as specified in the standard.
int texel_num = g_idx_to_num[i][j];
pix_data |= msb << (texel_num + 16);
pix_data |= lsb << (texel_num);
}
}
WritePixelData(dst, pix_data);
}
} // namespace
void TextureCompressorETC1SSE::Compress(const uint8_t* src,
uint8_t* dst,
int width,
int height,
Quality quality) {
DCHECK_GE(width, 4);
DCHECK_EQ((width & 3), 0);
DCHECK_GE(height, 4);
DCHECK_EQ((height & 3), 0);
ALIGNAS(16) uint8_t block[64];
__m128i packed[4];
__m128i red[4], green[4], blue[4], alpha[4];
__sse_data data;
for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
for (int x = 0; x < width; x += 4, dst += 8) {
ExtractBlock(block, src + x * 4, width);
if (TransposeBlock(block, packed) == false) {
CompressSolid(dst, block);
} else {
UnpackBlock(packed, blue, green, red, alpha);
data.block = block;
data.packed = packed;
data.red = red;
data.blue = blue;
data.green = green;
CompressBlock(dst, &data);
}
}
}
}
} // namespace cc
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CC_RESOURCES_TEXTURE_COMPRESSOR_ETC1_SSE_H_
#define CC_RESOURCES_TEXTURE_COMPRESSOR_ETC1_SSE_H_
#include "cc/resources/texture_compressor.h"
namespace cc {
class CC_EXPORT TextureCompressorETC1SSE : public TextureCompressor {
public:
TextureCompressorETC1SSE() {}
// Compress a texture using ETC1. Note that the |quality| parameter is
// ignored. The current implementation does not support different quality
// settings.
void Compress(const uint8_t* src,
uint8_t* dst,
int width,
int height,
Quality quality) override;
private:
DISALLOW_COPY_AND_ASSIGN(TextureCompressorETC1SSE);
};
} // namespace cc
#endif // CC_RESOURCES_TEXTURE_COMPRESSOR_ETC1_SSE_H_
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "cc/resources/texture_compressor.h"
#include "cc/base/util.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace cc {
namespace {
const int kImageWidth = 256;
const int kImageHeight = 256;
const int kImageChannels = 4;
const int kImageSizeInBytes = kImageWidth * kImageHeight * kImageChannels;
TEST(TextureCompressorETC1Test, Compress256x256Ratio) {
scoped_ptr<TextureCompressor> compressor =
TextureCompressor::Create(TextureCompressor::kFormatETC1);
uint8_t src[kImageSizeInBytes];
uint8_t dst[kImageSizeInBytes];
const unsigned int kImagePoison = 0xDEADBEEF;
// Poison destination bytes so we can see how much has been
// overwritten by the compression algorithm.
uint32_t* dst_32 = reinterpret_cast<uint32_t*>(dst);
for (int i = 0; i < kImageWidth * kImageHeight; i++) {
dst_32[i] = kImagePoison;
}
// Generate test texture.
for (int i = 0; i < kImageSizeInBytes; i++) {
src[i] = i % 256;
}
compressor->Compress(src, dst, kImageWidth, kImageHeight,
TextureCompressor::kQualityLow);
int compressed_size = 0;
for (compressed_size = 0; compressed_size < kImageWidth * kImageHeight;
compressed_size++) {
if (dst_32[compressed_size] == kImagePoison) {
// Convert the number of written 32-bit words to the compressed size in bytes.
compressed_size = compressed_size * 4;
break;
}
}
// Check that the compression ratio is 8:1 for RGBA or BGRA images, after
// discarding the alpha channel.
EXPECT_EQ(kImageSizeInBytes, compressed_size * 8);
}
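// Editor's note on the expected ratio (illustrative arithmetic, not part of
// the original CL): the source image is 256 * 256 * 4 = 262144 bytes and ETC1
// stores each 4x4 block in 8 bytes, so the output is
// (256 / 4) * (256 / 4) * 8 = 32768 bytes, exactly 8x smaller, which is what
// the EXPECT_EQ above verifies.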
} // namespace
} // namespace cc
@@ -17,8 +17,12 @@ const int kTimeCheckInterval = 10;
const int kImageWidth = 256;
const int kImageHeight = 256;
const int kImageChannels = 4;
const int kImageSizeInBytes = kImageWidth * kImageHeight * kImageChannels;
const int kImageSizeInBytes = kImageWidth * kImageHeight * 4;
const TextureCompressor::Quality kQualities[] = {
TextureCompressor::kQualityLow,
TextureCompressor::kQualityMedium,
TextureCompressor::kQualityHigh};
std::string FormatName(TextureCompressor::Format format) {
switch (format) {
@@ -45,9 +49,7 @@ std::string QualityName(TextureCompressor::Quality quality) {
}
class TextureCompressorPerfTest
: public testing::TestWithParam<
::testing::tuple<TextureCompressor::Quality,
TextureCompressor::Format>> {
: public testing::TestWithParam<TextureCompressor::Format> {
public:
TextureCompressorPerfTest()
: timer_(kWarmupRuns,
@@ -55,20 +57,18 @@ class TextureCompressorPerfTest
kTimeCheckInterval) {}
void SetUp() override {
TextureCompressor::Format format = ::testing::get<1>(GetParam());
TextureCompressor::Format format = GetParam();
compressor_ = TextureCompressor::Create(format);
}
void RunTest(const std::string& name) {
TextureCompressor::Quality quality = ::testing::get<0>(GetParam());
void RunTest(const std::string& name, TextureCompressor::Quality quality) {
timer_.Reset();
do {
compressor_->Compress(src_, dst_, kImageWidth, kImageHeight, quality);
timer_.NextLap();
} while (!timer_.HasTimeLimitExpired());
TextureCompressor::Format format = ::testing::get<1>(GetParam());
std::string str = FormatName(format) + " " + QualityName(quality);
std::string str = FormatName(GetParam()) + " " + QualityName(quality);
perf_test::PrintResult("Compress256x256", name, str, timer_.MsPerLap(),
"us", true);
}
@@ -80,42 +80,24 @@ class TextureCompressorPerfTest
uint8_t dst_[kImageSizeInBytes];
};
TEST_P(TextureCompressorPerfTest, Compress256x256BlackAndWhiteGradientImage) {
TEST_P(TextureCompressorPerfTest, Compress256x256Image) {
for (int i = 0; i < kImageSizeInBytes; ++i)
src_[i] = i % 256;
RunTest("BlackAndWhiteGradientImage");
for (auto& quality : kQualities)
RunTest("Image", quality);
}
TEST_P(TextureCompressorPerfTest, Compress256x256SolidBlackImage) {
TEST_P(TextureCompressorPerfTest, Compress256x256SolidImage) {
memset(src_, 0, kImageSizeInBytes);
RunTest("SolidBlackImage");
}
TEST_P(TextureCompressorPerfTest, Compress256x256SolidColorImage) {
for (int i = 0; i < kImageSizeInBytes; ++i)
src_[i] = (4 - i % 4) * 50;
RunTest("SolidColorImage");
for (auto& quality : kQualities)
RunTest("SolidImage", quality);
}
TEST_P(TextureCompressorPerfTest, Compress256x256RandomColorImage) {
unsigned int kImageSeed = 1234567890;
srand(kImageSeed);
for (int i = 0; i < kImageSizeInBytes; ++i)
src_[i] = rand() % 256; // NOLINT
RunTest("RandomColorImage");
}
INSTANTIATE_TEST_CASE_P(
TextureCompressorPerfTests,
INSTANTIATE_TEST_CASE_P(TextureCompressorPerfTests,
TextureCompressorPerfTest,
::testing::Combine(::testing::Values(TextureCompressor::kQualityLow,
TextureCompressor::kQualityMedium,
TextureCompressor::kQualityHigh),
::testing::Values(TextureCompressor::kFormatETC1)));
::testing::Values(TextureCompressor::kFormatETC1));
} // namespace
} // namespace cc