Skip to content

Commit 248542a

Browse files
committed
Add AVX2 implementation of first_max_element
This also now requires BMI1 and BMI2 for AVX2 in `CpuFeatureLevel`.
1 parent 2ec4e67 commit 248542a

File tree

6 files changed

+101
-51
lines changed

6 files changed

+101
-51
lines changed

src/asm/x86/dist/hbd.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ macro_rules! satd_hbd_avx2 {
55
($(($W:expr, $H:expr)),*) => {
66
$(
77
paste::item! {
8-
#[target_feature(enable = "avx2")]
8+
#[target_feature(enable = "avx2,bmi1,bmi2")]
99
pub(crate) unsafe extern fn [<rav1e_satd_ $W x $H _hbd_avx2>](
1010
src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize,
1111
) -> u32 {
@@ -43,7 +43,7 @@ macro_rules! satd_kernel_hbd_avx2 {
4343
($(($W:expr, $H:expr)),*) => {
4444
$(
4545
paste::item! {
46-
#[target_feature(enable = "avx2")]
46+
#[target_feature(enable = "avx2,bmi1,bmi2")]
4747
unsafe extern fn [<satd_kernel_ $W x $H _hbd_avx2>](
4848
src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize,
4949
) -> u64 {

src/asm/x86/lrf.rs

+10-10
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ static X_BY_XPLUS1: [u32; 256] = [
158158
];
159159

160160
#[inline]
161-
#[target_feature(enable = "avx2")]
161+
#[target_feature(enable = "avx2,bmi1,bmi2")]
162162
unsafe fn sgrproj_box_ab_8_avx2(
163163
r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
164164
iimg_stride: usize, x: usize, y: usize, s: u32, bdm8: usize,
@@ -169,7 +169,7 @@ unsafe fn sgrproj_box_ab_8_avx2(
169169

170170
// Using an integral image, compute the sum of a square region
171171
#[inline]
172-
#[target_feature(enable = "avx2")]
172+
#[target_feature(enable = "avx2,bmi1,bmi2")]
173173
unsafe fn get_integral_square_avx2(
174174
iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
175175
) -> __m256i {
@@ -234,7 +234,7 @@ unsafe fn sgrproj_box_ab_8_avx2(
234234
_mm256_storeu_si256(bf.as_mut_ptr().add(x) as *mut _, b);
235235
}
236236

237-
#[target_feature(enable = "avx2")]
237+
#[target_feature(enable = "avx2,bmi1,bmi2")]
238238
pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
239239
af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
240240
iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
@@ -293,7 +293,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
293293
}
294294
}
295295

296-
#[target_feature(enable = "avx2")]
296+
#[target_feature(enable = "avx2,bmi1,bmi2")]
297297
pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
298298
af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
299299
iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
@@ -353,7 +353,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
353353
}
354354

355355
#[inline]
356-
#[target_feature(enable = "avx2")]
356+
#[target_feature(enable = "avx2,bmi1,bmi2")]
357357
unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
358358
f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice<T>,
359359
) {
@@ -374,7 +374,7 @@ unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
374374
);
375375
}
376376

377-
#[target_feature(enable = "avx2")]
377+
#[target_feature(enable = "avx2,bmi1,bmi2")]
378378
pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
379379
f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
380380
) {
@@ -396,7 +396,7 @@ pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
396396
}
397397

398398
#[inline]
399-
#[target_feature(enable = "avx2")]
399+
#[target_feature(enable = "avx2,bmi1,bmi2")]
400400
unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
401401
af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize,
402402
cdeffed: &PlaneSlice<T>,
@@ -496,7 +496,7 @@ unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
496496
);
497497
}
498498

499-
#[target_feature(enable = "avx2")]
499+
#[target_feature(enable = "avx2,bmi1,bmi2")]
500500
pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
501501
af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
502502
cdeffed: &PlaneSlice<T>,
@@ -519,7 +519,7 @@ pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
519519
}
520520

521521
#[inline]
522-
#[target_feature(enable = "avx2")]
522+
#[target_feature(enable = "avx2,bmi1,bmi2")]
523523
unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
524524
af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
525525
x: usize, y: usize, cdeffed: &PlaneSlice<T>,
@@ -618,7 +618,7 @@ unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
618618
);
619619
}
620620

621-
#[target_feature(enable = "avx2")]
621+
#[target_feature(enable = "avx2,bmi1,bmi2")]
622622
pub(crate) unsafe fn sgrproj_box_f_r2_avx2<T: Pixel>(
623623
af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
624624
y: usize, w: usize, cdeffed: &PlaneSlice<T>,

src/asm/x86/quantize.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ pub fn dequantize<T: Coefficient>(
8585
}
8686
}
8787

88-
#[target_feature(enable = "avx2")]
88+
#[target_feature(enable = "avx2,bmi1,bmi2")]
8989
unsafe fn dequantize_avx2(
9090
qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16,
9191
tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,

src/asm/x86/transform/forward.rs

+19-19
Original file line numberDiff line numberDiff line change
@@ -63,27 +63,27 @@ struct I32X8 {
6363
}
6464

6565
impl I32X8 {
66-
#[target_feature(enable = "avx2")]
66+
#[target_feature(enable = "avx2,bmi1,bmi2")]
6767
#[inline]
6868
const unsafe fn vec(self) -> __m256i {
6969
self.data
7070
}
7171

72-
#[target_feature(enable = "avx2")]
72+
#[target_feature(enable = "avx2,bmi1,bmi2")]
7373
#[inline]
7474
const unsafe fn new(a: __m256i) -> I32X8 {
7575
I32X8 { data: a }
7676
}
7777
}
7878

7979
impl TxOperations for I32X8 {
80-
#[target_feature(enable = "avx2")]
80+
#[target_feature(enable = "avx2,bmi1,bmi2")]
8181
#[inline]
8282
unsafe fn zero() -> Self {
8383
I32X8::new(_mm256_setzero_si256())
8484
}
8585

86-
#[target_feature(enable = "avx2")]
86+
#[target_feature(enable = "avx2,bmi1,bmi2")]
8787
#[inline]
8888
unsafe fn tx_mul(self, mul: (i32, i32)) -> Self {
8989
I32X8::new(_mm256_srav_epi32(
@@ -95,7 +95,7 @@ impl TxOperations for I32X8 {
9595
))
9696
}
9797

98-
#[target_feature(enable = "avx2")]
98+
#[target_feature(enable = "avx2,bmi1,bmi2")]
9999
#[inline]
100100
unsafe fn rshift1(self) -> Self {
101101
I32X8::new(_mm256_srai_epi32(
@@ -107,34 +107,34 @@ impl TxOperations for I32X8 {
107107
))
108108
}
109109

110-
#[target_feature(enable = "avx2")]
110+
#[target_feature(enable = "avx2,bmi1,bmi2")]
111111
#[inline]
112112
unsafe fn add(self, b: Self) -> Self {
113113
I32X8::new(_mm256_add_epi32(self.vec(), b.vec()))
114114
}
115115

116-
#[target_feature(enable = "avx2")]
116+
#[target_feature(enable = "avx2,bmi1,bmi2")]
117117
#[inline]
118118
unsafe fn sub(self, b: Self) -> Self {
119119
I32X8::new(_mm256_sub_epi32(self.vec(), b.vec()))
120120
}
121121

122-
#[target_feature(enable = "avx2")]
122+
#[target_feature(enable = "avx2,bmi1,bmi2")]
123123
#[inline]
124124
unsafe fn add_avg(self, b: Self) -> Self {
125125
I32X8::new(_mm256_srai_epi32(_mm256_add_epi32(self.vec(), b.vec()), 1))
126126
}
127127

128-
#[target_feature(enable = "avx2")]
128+
#[target_feature(enable = "avx2,bmi1,bmi2")]
129129
#[inline]
130130
unsafe fn sub_avg(self, b: Self) -> Self {
131131
I32X8::new(_mm256_srai_epi32(_mm256_sub_epi32(self.vec(), b.vec()), 1))
132132
}
133133
}
134134

135-
impl_1d_tx!(target_feature(enable = "avx2"), unsafe);
135+
impl_1d_tx!(target_feature(enable = "avx2,bmi1,bmi2"), unsafe);
136136

137-
#[target_feature(enable = "avx2")]
137+
#[target_feature(enable = "avx2,bmi1,bmi2")]
138138
unsafe fn transpose_8x8_avx2(
139139
input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8),
140140
) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) {
@@ -175,7 +175,7 @@ unsafe fn transpose_8x8_avx2(
175175
)
176176
}
177177

178-
#[target_feature(enable = "avx2")]
178+
#[target_feature(enable = "avx2,bmi1,bmi2")]
179179
unsafe fn transpose_8x4_avx2(
180180
input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8),
181181
) -> (I32X8, I32X8, I32X8, I32X8) {
@@ -213,7 +213,7 @@ unsafe fn transpose_8x4_avx2(
213213
)
214214
}
215215

216-
#[target_feature(enable = "avx2")]
216+
#[target_feature(enable = "avx2,bmi1,bmi2")]
217217
unsafe fn transpose_4x8_avx2(
218218
input: (I32X8, I32X8, I32X8, I32X8),
219219
) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) {
@@ -246,7 +246,7 @@ unsafe fn transpose_4x8_avx2(
246246
)
247247
}
248248

249-
#[target_feature(enable = "avx2")]
249+
#[target_feature(enable = "avx2,bmi1,bmi2")]
250250
unsafe fn transpose_4x4_avx2(
251251
input: (I32X8, I32X8, I32X8, I32X8),
252252
) -> (I32X8, I32X8, I32X8, I32X8) {
@@ -265,13 +265,13 @@ unsafe fn transpose_4x4_avx2(
265265
)
266266
}
267267

268-
#[target_feature(enable = "avx2")]
268+
#[target_feature(enable = "avx2,bmi1,bmi2")]
269269
#[inline]
270270
unsafe fn shift_left(a: I32X8, shift: u8) -> I32X8 {
271271
I32X8::new(_mm256_sllv_epi32(a.vec(), _mm256_set1_epi32(shift as i32)))
272272
}
273273

274-
#[target_feature(enable = "avx2")]
274+
#[target_feature(enable = "avx2,bmi1,bmi2")]
275275
#[inline]
276276
unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 {
277277
I32X8::new(_mm256_srav_epi32(
@@ -280,7 +280,7 @@ unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 {
280280
))
281281
}
282282

283-
#[target_feature(enable = "avx2")]
283+
#[target_feature(enable = "avx2,bmi1,bmi2")]
284284
#[inline]
285285
unsafe fn round_shift_array_avx2(arr: &mut [I32X8], size: usize, bit: i8) {
286286
if bit == 0 {
@@ -328,7 +328,7 @@ impl SizeClass1D {
328328
}
329329

330330
#[allow(clippy::identity_op, clippy::erasing_op)]
331-
#[target_feature(enable = "avx2")]
331+
#[target_feature(enable = "avx2,bmi1,bmi2")]
332332
unsafe fn forward_transform_avx2<T: Coefficient>(
333333
input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
334334
tx_type: TxType, bd: usize,
@@ -355,7 +355,7 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
355355
// Columns
356356
for cg in (0..txfm_size_col).step_by(8) {
357357
let shift = cfg.shift[0] as u8;
358-
#[target_feature(enable = "avx2")]
358+
#[target_feature(enable = "avx2,bmi1,bmi2")]
359359
#[inline]
360360
unsafe fn load_columns(input_ptr: *const i16, shift: u8) -> I32X8 {
361361
// TODO: load 64-bits for x4 wide columns

src/cdef.rs

+65-18
Original file line numberDiff line numberDiff line change
@@ -59,20 +59,58 @@ pub(crate) mod rust {
5959
///
6060
/// # Arguments
6161
///
62-
/// * `elems` - A non-empty slice of integers
63-
///
64-
/// # Panics
65-
///
66-
/// Panics if `elems` is empty
62+
/// * `elems` - A slice of 8 `i32`s
6763
#[inline]
68-
fn first_max_element(elems: &[i32]) -> (usize, i32) {
69-
// In case of a tie, the first element must be selected.
70-
let (max_idx, max_value) = elems
71-
.iter()
72-
.enumerate()
73-
.max_by_key(|&(i, v)| (v, -(i as isize)))
74-
.unwrap();
75-
(max_idx, *max_value)
64+
#[allow(clippy::collapsible_if)]
65+
fn first_max_element(
66+
elems: &[i32; 8], cpu: CpuFeatureLevel,
67+
) -> (usize, i32) {
68+
// Same as `first_max_element`, but implemented with AVX2 intrinsics
69+
#[inline]
70+
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
71+
#[target_feature(enable = "avx2,bmi1,bmi2")]
72+
unsafe fn first_max_element_avx2(elems: &[i32; 8]) -> (usize, i32) {
73+
#[cfg(target_arch = "x86")]
74+
use std::arch::x86::*;
75+
#[cfg(target_arch = "x86_64")]
76+
use std::arch::x86_64::*;
77+
78+
// the compiler autovectorizes this
79+
let max_val = *elems.iter().max().unwrap();
80+
81+
let cmp = _mm256_cmpeq_epi32(
82+
_mm256_loadu_si256(elems as *const i32 as *const _),
83+
_mm256_set1_epi32(max_val),
84+
);
85+
// this intrinsic is supposed to be for floating point, but it works
86+
// fine on integer data as well
87+
let mask = _mm256_movemask_ps(std::mem::transmute(cmp));
88+
89+
(mask.trailing_zeros() as usize, max_val)
90+
}
91+
92+
#[inline]
93+
fn _first_max_element(elems: &[i32; 8]) -> (usize, i32) {
94+
// In case of a tie, the first element must be selected.
95+
let (max_idx, max_value) = elems
96+
.iter()
97+
.enumerate()
98+
.max_by_key(|&(i, v)| (v, -(i as isize)))
99+
.unwrap();
100+
(max_idx, *max_value)
101+
}
102+
103+
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
104+
if cpu >= CpuFeatureLevel::AVX2 {
105+
let result = unsafe { first_max_element_avx2(elems) };
106+
107+
#[cfg(feature = "check_asm")]
108+
assert_eq!(result, _first_max_element(elems));
109+
110+
return result;
111+
}
112+
113+
_first_max_element(elems)
76114
}
77115

78116
// Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
@@ -84,7 +122,7 @@ pub(crate) mod rust {
84122
// http://jmvalin.ca/notes/intra_paint.pdf
85123
pub fn cdef_find_dir<T: Pixel>(
86124
img: &PlaneSlice<'_, T>, var: &mut u32, coeff_shift: usize,
87-
_cpu: CpuFeatureLevel,
125+
cpu: CpuFeatureLevel,
88126
) -> i32 {
89127
let mut cost: [i32; 8] = [0; 8];
90128
let mut partial: [[i32; 15]; 8] = [[0; 15]; 8];
@@ -133,7 +171,7 @@ pub(crate) mod rust {
133171
}
134172
}
135173

136-
let (best_dir, best_cost) = first_max_element(&cost);
174+
let (best_dir, best_cost) = first_max_element(&cost, cpu);
137175
// Difference between the optimal variance and the variance along the
138176
// orthogonal direction. Again, the sum(x^2) terms cancel out.
139177
// We'd normally divide by 840, but dividing by 1024 is close enough
@@ -305,9 +343,18 @@ pub(crate) mod rust {
305343

306344
#[test]
307345
fn check_max_element() {
308-
assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6]), (6, 6));
309-
assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6]), (6, 7));
310-
assert_eq!(first_max_element(&[0, 0]), (0, 0));
346+
assert_eq!(
347+
first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6], CpuFeatureLevel::RUST),
348+
(6, 6)
349+
);
350+
assert_eq!(
351+
first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6], CpuFeatureLevel::RUST),
352+
(6, 7)
353+
);
354+
assert_eq!(
355+
first_max_element(&[0, 0, 0, 0, 0, 0, 0, 0], CpuFeatureLevel::RUST),
356+
(0, 0)
357+
);
311358
}
312359
}
313360
}

src/cpu_features/x86.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,10 @@ impl Default for CpuFeatureLevel {
6262
CpuFeatureLevel::AVX512ICL
6363
} else if avx512_detected() {
6464
CpuFeatureLevel::AVX512
65-
} else if is_x86_feature_detected!("avx2") {
65+
} else if is_x86_feature_detected!("avx2")
66+
&& is_x86_feature_detected!("bmi1")
67+
&& is_x86_feature_detected!("bmi2")
68+
{
6669
CpuFeatureLevel::AVX2
6770
} else if is_x86_feature_detected!("sse4.1") {
6871
CpuFeatureLevel::SSE4_1

0 commit comments

Comments
 (0)