-
-
Save ilyakurdyukov/f514418f3affd677e1ac408ec0c09188 to your computer and use it in GitHub Desktop.
| From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001 | |
| From: Ilya Kurdyukov <jpegqs@gmail.com> | |
| Date: Tue, 14 Dec 2021 21:54:32 +0700 | |
| Subject: [PATCH] faster lzma_decoder for x86 | |
| Notice: Uses inline assembly with CMOV instruction. | |
| Another change that removes the comparison with in_size can give a few | |
| percent speedup for architectures with a small number of registers. | |
| --- | |
| src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++------------- | |
| src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++-- | |
| 2 files changed, 113 insertions(+), 43 deletions(-) | |
| diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c | |
| index e605a0a..ecb804b 100644 | |
| --- a/src/liblzma/lzma/lzma_decoder.c | |
| +++ b/src/liblzma/lzma/lzma_decoder.c | |
| @@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
| = offset + match_bit | |
| + symbol; | |
| - rc_bit(probs[subcoder_index], | |
| - offset &= ~match_bit, | |
| - offset &= match_bit, | |
| + rc_bit_matched(probs[subcoder_index], | |
| SEQ_LITERAL_MATCHED); | |
| // It seems to be faster to do this | |
| @@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
| case seq: \ | |
| match_bit = len & offset; \ | |
| subcoder_index = offset + match_bit + symbol; \ | |
| - rc_bit(probs[subcoder_index], \ | |
| - offset &= ~match_bit, \ | |
| - offset &= match_bit, \ | |
| - seq) | |
| + rc_bit_matched(probs[subcoder_index], seq) | |
| d(SEQ_LITERAL_MATCHED0); | |
| len <<= 1; | |
| @@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
| probs = coder->pos_special + rep0 | |
| - symbol - 1; | |
| symbol = 1; | |
| - offset = 0; | |
| - case SEQ_DIST_MODEL: | |
| + offset = 1; | |
| #ifdef HAVE_SMALL | |
| + limit = 1 << limit; | |
| + case SEQ_DIST_MODEL: | |
| do { | |
| - rc_bit(probs[symbol], , | |
| - rep0 += 1U << offset, | |
| + rc_bit_dist(probs[symbol], | |
| + offset, | |
| SEQ_DIST_MODEL); | |
| - } while (++offset < limit); | |
| + offset <<= 1; | |
| + } while (offset < limit); | |
| #else | |
| + case SEQ_DIST_MODEL: | |
| switch (limit) { | |
| case 5: | |
| assert(offset == 0); | |
| - rc_bit(probs[symbol], , | |
| - rep0 += 1U, | |
| + rc_bit_dist(probs[symbol], | |
| + offset, | |
| SEQ_DIST_MODEL); | |
| - ++offset; | |
| + offset <<= 1; | |
| --limit; | |
| case 4: | |
| - rc_bit(probs[symbol], , | |
| - rep0 += 1U << offset, | |
| + rc_bit_dist(probs[symbol], | |
| + offset, | |
| SEQ_DIST_MODEL); | |
| - ++offset; | |
| + offset <<= 1; | |
| --limit; | |
| case 3: | |
| - rc_bit(probs[symbol], , | |
| - rep0 += 1U << offset, | |
| + rc_bit_dist(probs[symbol], | |
| + offset, | |
| SEQ_DIST_MODEL); | |
| - ++offset; | |
| + offset <<= 1; | |
| --limit; | |
| case 2: | |
| - rc_bit(probs[symbol], , | |
| - rep0 += 1U << offset, | |
| + rc_bit_dist(probs[symbol], | |
| + offset, | |
| SEQ_DIST_MODEL); | |
| - ++offset; | |
| + offset <<= 1; | |
| --limit; | |
| case 1: | |
| // We need "symbol" only for | |
| @@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
| // rc_bit_last() here to omit | |
| // the unneeded updating of | |
| // "symbol". | |
| - rc_bit_last(probs[symbol], , | |
| - rep0 += 1U << offset, | |
| + rc_bit_dist_last(probs[symbol], | |
| + offset, | |
| SEQ_DIST_MODEL); | |
| } | |
| #endif | |
| @@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
| rep0 <<= ALIGN_BITS; | |
| symbol = 1; | |
| #ifdef HAVE_SMALL | |
| - offset = 0; | |
| + offset = 1; | |
| case SEQ_ALIGN: | |
| do { | |
| - rc_bit(coder->pos_align[ | |
| - symbol], , | |
| - rep0 += 1U << offset, | |
| + rc_bit_dist(coder->pos_align[ | |
| + symbol], | |
| + offset, | |
| SEQ_ALIGN); | |
| - } while (++offset < ALIGN_BITS); | |
| + offset <<= 1; | |
| + } while (offset < (1 << ALIGN_BITS)); | |
| #else | |
| case SEQ_ALIGN0: | |
| - rc_bit(coder->pos_align[symbol], , | |
| - rep0 += 1, SEQ_ALIGN0); | |
| + rc_bit_dist(coder->pos_align[symbol], | |
| + 1, SEQ_ALIGN0); | |
| case SEQ_ALIGN1: | |
| - rc_bit(coder->pos_align[symbol], , | |
| - rep0 += 2, SEQ_ALIGN1); | |
| + rc_bit_dist(coder->pos_align[symbol], | |
| + 2, SEQ_ALIGN1); | |
| case SEQ_ALIGN2: | |
| - rc_bit(coder->pos_align[symbol], , | |
| - rep0 += 4, SEQ_ALIGN2); | |
| + rc_bit_dist(coder->pos_align[symbol], | |
| + 4, SEQ_ALIGN2); | |
| case SEQ_ALIGN3: | |
| // Like in SEQ_DIST_MODEL, we don't | |
| // need "symbol" for anything else | |
| // than indexing the probability array. | |
| - rc_bit_last(coder->pos_align[symbol], , | |
| - rep0 += 8, SEQ_ALIGN3); | |
| + rc_bit_dist_last(coder->pos_align[ | |
| + symbol], | |
| + 8, SEQ_ALIGN3); | |
| #endif | |
| if (rep0 == UINT32_MAX) { | |
| diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h | |
| index e0b051f..cc9ac35 100644 | |
| --- a/src/liblzma/rangecoder/range_decoder.h | |
| +++ b/src/liblzma/rangecoder/range_decoder.h | |
| @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, | |
| /// variables `in' and `in_size' to be defined. | |
| #define rc_to_local(range_decoder, in_pos) \ | |
| lzma_range_decoder rc = range_decoder; \ | |
| - size_t rc_in_pos = (in_pos); \ | |
| + size_t rc_in_pos = in_pos - in_size; \ | |
| + const uint8_t *restrict rc_end = in + in_size; \ | |
| uint32_t rc_bound | |
| @@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, | |
| #define rc_from_local(range_decoder, in_pos) \ | |
| do { \ | |
| range_decoder = rc; \ | |
| - in_pos = rc_in_pos; \ | |
| + in_pos = rc_in_pos + in_size; \ | |
| } while (0) | |
| @@ -87,12 +88,13 @@ do { \ | |
| #define rc_normalize(seq) \ | |
| do { \ | |
| if (rc.range < RC_TOP_VALUE) { \ | |
| - if (unlikely(rc_in_pos == in_size)) { \ | |
| + if (unlikely(!rc_in_pos)) { \ | |
| coder->sequence = seq; \ | |
| goto out; \ | |
| } \ | |
| + rc.code <<= RC_SHIFT_BITS; \ | |
| rc.range <<= RC_SHIFT_BITS; \ | |
| - rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \ | |
| + rc.code |= rc_end[rc_in_pos++]; \ | |
| } \ | |
| } while (0) | |
| @@ -151,11 +153,79 @@ do { \ | |
| /// Decodes one bit, updates "symbol", and runs action0 or action1 depending | |
| /// on the decoded bit. | |
| +#ifndef NO_BRANCH_OPT | |
| +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | |
| +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \ | |
| +do { \ | |
| + rc_normalize(seq); \ | |
| + uint32_t cache = (prob), tmp; \ | |
| + rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \ | |
| + rc.range -= rc_bound; \ | |
| + /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \ | |
| + __asm__ ( \ | |
| + "cmpl %3, %2\n\t" \ | |
| + "cmovbl %3, %1\n\t" \ | |
| + "sbbl %0, %0" \ | |
| + : "=&r"(tmp), "+&r"(rc.range) \ | |
| + : "r"(rc.code), "r"(rc_bound) \ | |
| + : "flags" \ | |
| + ); \ | |
| + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ | |
| + prob -= cache >> RC_MOVE_BITS; \ | |
| + pre0; tmp = ~tmp; pre1; \ | |
| + rc.code -= tmp & rc_bound; \ | |
| + if (!tmp) { action0; } else { action1; }; \ | |
| +} while (0) | |
| +#elif defined(__GNUC__) && defined(__aarch64__) | |
| +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \ | |
| +do { \ | |
| + rc_normalize(seq); \ | |
| + uint32_t cache = (prob), tmp; \ | |
| + rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \ | |
| + rc.range -= rc_bound; \ | |
| + /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \ | |
| + __asm__ ( \ | |
| + "cmp %w2, %w3\n\t" \ | |
| + "csel %w1, %w3, %w1, lo\n\t" \ | |
| + "csetm %w0, lo" \ | |
| + : "=&r"(tmp), "+&r"(rc.range) \ | |
| + : "r"(rc.code), "r"(rc_bound) \ | |
| + : "cc" \ | |
| + ); \ | |
| + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ | |
| + prob -= cache >> RC_MOVE_BITS; \ | |
| + pre0; tmp = ~tmp; pre1; \ | |
| + rc.code -= tmp & rc_bound; \ | |
| + if (!tmp) { action0; } else { action1; }; \ | |
| +} while (0) | |
| +#endif | |
| +#endif | |
| +#ifdef rc_bit_nobranch | |
| +#define rc_bit(prob, action0, action1, seq) \ | |
| + rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \ | |
| + action0, action1, seq) | |
| +#define rc_bit_matched(prob, seq) \ | |
| + rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \ | |
| + symbol = (symbol << 1) - tmp, , , seq) | |
| +#define rc_bit_dist(prob, offset, seq) \ | |
| + rc_bit_nobranch(prob, , \ | |
| + symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \ | |
| + , , seq); | |
| +#define rc_bit_dist_last(prob, offset, seq) \ | |
| + rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq); | |
| +#else | |
| #define rc_bit(prob, action0, action1, seq) \ | |
| rc_bit_last(prob, \ | |
| symbol <<= 1; action0, \ | |
| symbol = (symbol << 1) + 1; action1, \ | |
| seq); | |
| +#define rc_bit_matched(prob, seq) \ | |
| + rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq) | |
| +#define rc_bit_dist(prob, offset, seq) \ | |
| + rc_bit(prob, , rep0 += offset, seq); | |
| +#define rc_bit_dist_last(prob, offset, seq) \ | |
| + rc_bit_last(prob, , rep0 += offset, seq) | |
| +#endif | |
| /// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled | |
| -- | |
| 2.17.1 |
Intel Celeron 1007U (Ivy Bridge)
linux-5.15.7.tar.xz : 19.33 --> 18.07 (+7%)
linux-firmware-20211027.tar.xz : 20.57 --> 17.76 (+16%)
Ryzen 4500U (Zen 2)
Before:
Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):
11,907.43 msec task-clock:u # 1.000 CPUs utilized ( +- 0.03% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
17,240 page-faults:u # 1.448 K/sec ( +- 0.02% )
27,992,949,586 cycles:u # 2.351 GHz ( +- 0.04% )
583,797,051 stalled-cycles-frontend:u # 2.09% frontend cycles idle ( +- 0.06% )
3,412,760,275 stalled-cycles-backend:u # 12.19% backend cycles idle ( +- 0.36% )
35,183,073,831 instructions:u # 1.26 insn per cycle
# 0.10 stalled cycles per insn ( +- 0.00% )
4,182,899,643 branches:u # 351.285 M/sec ( +- 0.00% )
531,756,997 branch-misses:u # 12.71% of all branches ( +- 0.01% )
11.90746 +- 0.00389 seconds time elapsed ( +- 0.03% )
After:
Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):
9,797.98 msec task-clock:u # 1.000 CPUs utilized ( +- 0.11% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
17,247 page-faults:u # 1.760 K/sec ( +- 0.02% )
23,012,154,561 cycles:u # 2.349 GHz ( +- 0.02% )
173,975,497 stalled-cycles-frontend:u # 0.76% frontend cycles idle ( +- 0.09% )
12,818,655,196 stalled-cycles-backend:u # 55.70% backend cycles idle ( +- 0.02% )
39,198,156,956 instructions:u # 1.70 insn per cycle
# 0.33 stalled cycles per insn ( +- 0.00% )
2,970,714,992 branches:u # 303.197 M/sec ( +- 0.00% )
156,064,253 branch-misses:u # 5.25% of all branches ( +- 0.01% )
9.7976 +- 0.0107 seconds time elapsed ( +- 0.11% )
Did a minor update to cover the rc_bit() macro, which is used when XZ is configured with the --enable-small option.
... and I did the same for AArch64, using CSEL instruction. (patch updated)
But I only have Allwinner H616 (Cortex-A53) for testing:
linux-5.15.7.tar.xz : 25.12 --> 23.85 (+5%)
linux-firmware-20211027.tar.xz : 22.80 --> 21.10 (+8%)
AArch64, AWS Graviton2 (c6g.xlarge size):
Before:
Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):
8751.09 msec task-clock # 1.000 CPUs utilized ( +- 0.02% )
44 context-switches # 0.005 K/sec ( +- 5.47% )
0 cpu-migrations # 0.000 K/sec ( +- 61.24% )
18112 page-faults # 0.002 M/sec ( +- 0.05% )
21832211556 cycles # 2.495 GHz ( +- 0.02% )
30273778750 instructions # 1.39 insn per cycle ( +- 0.01% )
<not supported> branches
535339600 branch-misses ( +- 0.01% )
8.75277 +- 0.00198 seconds time elapsed ( +- 0.02% )
After:
Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):
8111.05 msec task-clock # 1.000 CPUs utilized ( +- 0.01% )
48 context-switches # 0.006 K/sec ( +- 2.54% )
1 cpu-migrations # 0.000 K/sec ( +- 31.62% )
18088 page-faults # 0.002 M/sec ( +- 0.01% )
20232553634 cycles # 2.494 GHz ( +- 0.01% )
34157801215 instructions # 1.69 insn per cycle ( +- 0.00% )
<not supported> branches
159725908 branch-misses ( +- 0.02% )
8.112706 +- 0.000784 seconds time elapsed ( +- 0.01% )
~7.9% speedup.
Results from ALT Linux team:
Comet Lake:
linux-5.15.8.tar.xz : 5.81 --> 5.30 (+9.6%)
linux-firmware-20211027.tar.xz : 6.14 --> 5.31 (+15.6%)
Intel Xeon (Ivy Bridge EP):
linux-5.15.8.tar.xz : 7.987+-0.363 --> 7.128+-0.288 ( 10.75+-5.80 % )
linux-firmware-20211027.tar.xz : 8.552+-0.354 --> 7.059+-0.310 ( 17.46+-5.51 % )
I wrote scripts to make testing easier:
# downloads test files and builds XZ
./build.sh
# tests decompression speed
./bench.sh ref
./bench.sh new
build.sh
#!/bin/bash
set -e
# patch
[ -f faster_lzma_decoder_x86.patch ] || wget https://gist.githubusercontent.com/ilyakurdyukov/f514418f3affd677e1ac408ec0c09188/raw/faster_lzma_decoder_x86.patch
# test data
[ -f linux-5.15.7.tar.xz ] || wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.7.tar.xz
[ -f linux-firmware-20211027.tar.xz ] || wget https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz
[ -d xz ] || git clone http://git.tukaani.org/xz.git
mkdir -p ref
mkdir -p new
# build ref
cp -r xz build
cd build
# or cmake as an option
./autogen.sh --no-po4a
./configure
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../ref/
# build new
patch -p1 < ../faster_lzma_decoder_x86.patch
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../new/
cd ..
bench.sh
#!/bin/bash
dir=${1:-"ref"}
export LD_LIBRARY_PATH=$(pwd)/$dir
# to make sure the correct liblzma is loaded
ldd $dir/xz | grep liblzma
# or "perf stat -r 5" instead of "for" and "time -p"
for i in 1 2 3; do
echo "# ($dir:$i) linux-5.15.7"
time -p $dir/xz -c -d linux-5.15.7.tar.xz > /dev/null
echo "# ($dir:$i) linux-firmware-20211027"
time -p $dir/xz -c -d linux-firmware-20211027.tar.xz > /dev/null
doneKunpeng-920 (AArch64)
linux-5.15.7.tar.xz : 8.82 --> 8.78 (+0.4%)
linux-firmware-20211027.tar.xz : 8.97 --> 8.44 (+6.2%)
I got a report that this patch is slowing decoding on the Apple M1, but I am unable to investigate this on my own.
Well, at least it will be easy to exclude it for Apple M1 by adding some #ifndef.
@amonakov, yes it is a good archive for testing binary data decompression performance.
So, on Skylake before:
And after:
Speedup by almost 29%.