Skip to content

Commit a22fb64

Browse files
committed
I grow annoyed
1 parent 4e99fc0 commit a22fb64

File tree

2 files changed

+86
-44
lines changed

2 files changed

+86
-44
lines changed

src/denoise/mod.rs

+29-28
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use crate::api::FrameQueue;
22
use crate::cpu_features::CpuFeatureLevel;
3+
use crate::util::{Aligned, AlignedBoxedSlice};
34
use crate::EncoderStatus;
45
use arrayvec::ArrayVec;
56
use cfg_if::cfg_if;
@@ -296,13 +297,13 @@ where
296297
) -> &mut (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>);
297298

298299
fn do_filtering(&mut self, src: &[[Plane<T>; 3]], dest: &mut Frame<T>) {
299-
let mut dftr = [0f32; BLOCK_VOLUME];
300-
let mut dftc = [Complex::<f32>::default(); COMPLEX_COUNT];
301-
let mut means = [Complex::<f32>::default(); COMPLEX_COUNT];
300+
let mut dftr = Aligned::new([0f32; BLOCK_VOLUME]);
301+
let mut dftc = Aligned::new([Complex::<f32>::default(); COMPLEX_COUNT]);
302+
let mut means = Aligned::new([Complex::<f32>::default(); COMPLEX_COUNT]);
302303

303304
for p in 0..3 {
304305
let (pad_width, pad_height) = self.pad_dimensions(p);
305-
let mut ebuff = vec![0f32; pad_width * pad_height];
306+
let mut ebuff = AlignedBoxedSlice::new(pad_width * pad_height, 0f32);
306307
let effective_height = self.effective_height(p);
307308
let src_stride = src[0][p].cfg.stride;
308309
let ebuff_stride = pad_width;
@@ -324,23 +325,23 @@ where
324325
self.proc0(
325326
src_planes[z].get_unchecked(x..),
326327
self.hw().get_unchecked((BLOCK_AREA * z)..),
327-
dftr.get_unchecked_mut((BLOCK_AREA * z)..),
328+
dftr.data.get_unchecked_mut((BLOCK_AREA * z)..),
328329
src_stride,
329330
SB_SIZE,
330331
self.src_scale(),
331332
);
332333
}
333334

334-
self.real_to_complex_3d(&dftr, &mut dftc);
335-
self.remove_mean(&mut dftc, self.dftgc(), &mut means);
335+
self.real_to_complex_3d(&dftr.data, &mut dftc.data);
336+
self.remove_mean(&mut dftc.data, self.dftgc(), &mut means.data);
336337

337-
self.filter_coeffs(&mut dftc);
338+
self.filter_coeffs(&mut dftc.data);
338339

339-
self.add_mean(&mut dftc, &means);
340-
self.complex_to_real_3d(&dftc, &mut dftr);
340+
self.add_mean(&mut dftc.data, &means.data);
341+
self.complex_to_real_3d(&dftc.data, &mut dftr.data);
341342

342343
self.proc1(
343-
dftr.get_unchecked((TB_MIDPOINT * BLOCK_AREA)..),
344+
dftr.data.get_unchecked((TB_MIDPOINT * BLOCK_AREA)..),
344345
self.hw().get_unchecked((TB_MIDPOINT * BLOCK_AREA)..),
345346
ebuff.get_unchecked_mut((y * ebuff_stride + x)..),
346347
SB_SIZE,
@@ -405,7 +406,7 @@ where
405406
let s0 = s0.add(u * p0 + v);
406407
let s1 = s1.add(u * p0 + v);
407408
let dest = dest.add(u * p1 + v);
408-
dest.write(dest.read() + s0.read() * s1.read());
409+
dest.write(s0.read().mul_add(s1.read(), dest.read()));
409410
}
410411
}
411412
}
@@ -693,10 +694,10 @@ where
693694
pad_dimensions: ArrayVec<(usize, usize), 3>,
694695
effective_heights: ArrayVec<usize, 3>,
695696

696-
hw: [f32; BLOCK_VOLUME],
697-
dftgc: [Complex<f32>; COMPLEX_COUNT],
697+
hw: Aligned<[f32; BLOCK_VOLUME]>,
698+
dftgc: Aligned<[Complex<f32>; COMPLEX_COUNT]>,
698699
fft: (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>),
699-
sigmas: [f32; CCNT2],
700+
sigmas: Aligned<[f32; CCNT2]>,
700701
}
701702

702703
impl<T> DftDenoiserRust<T>
@@ -708,8 +709,8 @@ where
708709
pad_dimensions: ArrayVec<(usize, usize), 3>,
709710
effective_heights: ArrayVec<usize, 3>,
710711
) -> Self {
711-
let hw = create_window();
712-
let mut dftgr = [0f32; BLOCK_VOLUME];
712+
let hw = Aligned::new(create_window());
713+
let mut dftgr = Aligned::new([0f32; BLOCK_VOLUME]);
713714

714715
let fft = (
715716
R2cFftHandler::new(SB_SIZE),
@@ -719,15 +720,15 @@ where
719720

720721
let mut wscale = 0.0f32;
721722
for k in 0..BLOCK_VOLUME {
722-
dftgr[k] = 255.0 * hw[k];
723-
wscale += hw[k].powi(2);
723+
dftgr.data[k] = 255.0 * hw.data[k];
724+
wscale += hw.data[k].powi(2);
724725
}
725726
let wscale = 1.0 / wscale;
726727

727-
let mut sigmas = [0f32; CCNT2];
728-
sigmas.fill(sigma / wscale);
728+
let mut sigmas = Aligned::new([0f32; CCNT2]);
729+
sigmas.data.fill(sigma / wscale);
729730

730-
let mut denoiser = DftDenoiserRust {
731+
let mut denoiser = Self {
731732
dest_scale,
732733
src_scale,
733734
peak,
@@ -736,11 +737,11 @@ where
736737
hw,
737738
fft,
738739
sigmas,
739-
dftgc: [Complex::default(); COMPLEX_COUNT],
740+
dftgc: Aligned::new([Complex::default(); COMPLEX_COUNT]),
740741
};
741742

742-
let mut dftgc = [Complex::default(); COMPLEX_COUNT];
743-
denoiser.real_to_complex_3d(&dftgr, &mut dftgc);
743+
let mut dftgc = Aligned::new([Complex::default(); COMPLEX_COUNT]);
744+
denoiser.real_to_complex_3d(&dftgr.data, &mut dftgc.data);
744745
denoiser.dftgc = dftgc;
745746

746747
denoiser
@@ -778,17 +779,17 @@ where
778779

779780
#[inline(always)]
780781
fn hw(&self) -> &[f32; BLOCK_VOLUME] {
781-
&self.hw
782+
&self.hw.data
782783
}
783784

784785
#[inline(always)]
785786
fn dftgc(&self) -> &[Complex<f32>; COMPLEX_COUNT] {
786-
&self.dftgc
787+
&self.dftgc.data
787788
}
788789

789790
#[inline(always)]
790791
fn sigmas(&self) -> &[f32; CCNT2] {
791-
&self.sigmas
792+
&self.sigmas.data
792793
}
793794

794795
#[inline(always)]

src/denoise/x86.rs

+57-16
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
1+
use crate::util::Aligned;
12
use arrayvec::ArrayVec;
23
use ndrustfft::{Complex, FftHandler, R2cFftHandler};
4+
use std::arch::x86_64::{
5+
_mm256_castsi256_ps, _mm256_cvtepu16_epi32, _mm256_cvtepu8_epi32,
6+
_mm256_load_ps, _mm256_mul_ps, _mm256_set1_ps, _mm256_store_ps,
7+
_mm_load_si128,
8+
};
9+
use std::mem::size_of;
310
use v_frame::pixel::Pixel;
411

512
use super::{
@@ -19,10 +26,10 @@ where
1926
pad_dimensions: ArrayVec<(usize, usize), 3>,
2027
effective_heights: ArrayVec<usize, 3>,
2128

22-
hw: [f32; BLOCK_VOLUME],
23-
dftgc: [Complex<f32>; COMPLEX_COUNT],
29+
hw: Aligned<[f32; BLOCK_VOLUME]>,
30+
dftgc: Aligned<[Complex<f32>; COMPLEX_COUNT]>,
2431
fft: (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>),
25-
sigmas: [f32; CCNT2],
32+
sigmas: Aligned<[f32; CCNT2]>,
2633
}
2734

2835
impl<T> DftDenoiserAvx2<T>
@@ -35,8 +42,8 @@ where
3542
pad_dimensions: ArrayVec<(usize, usize), 3>,
3643
effective_heights: ArrayVec<usize, 3>,
3744
) -> Self {
38-
let hw = create_window();
39-
let mut dftgr = [0f32; BLOCK_VOLUME];
45+
let hw = Aligned::new(create_window());
46+
let mut dftgr = Aligned::new([0f32; BLOCK_VOLUME]);
4047

4148
let fft = (
4249
R2cFftHandler::new(SB_SIZE),
@@ -46,15 +53,15 @@ where
4653

4754
let mut wscale = 0.0f32;
4855
for k in 0..BLOCK_VOLUME {
49-
dftgr[k] = 255.0 * hw[k];
50-
wscale += hw[k].powi(2);
56+
dftgr.data[k] = 255.0 * hw.data[k];
57+
wscale += hw.data[k].powi(2);
5158
}
5259
let wscale = 1.0 / wscale;
5360

54-
let mut sigmas = [0f32; CCNT2];
55-
sigmas.fill(sigma / wscale);
61+
let mut sigmas = Aligned::new([0f32; CCNT2]);
62+
sigmas.data.fill(sigma / wscale);
5663

57-
let mut denoiser = DftDenoiserAvx2 {
64+
let mut denoiser = Self {
5865
dest_scale,
5966
src_scale,
6067
peak,
@@ -63,11 +70,11 @@ where
6370
hw,
6471
fft,
6572
sigmas,
66-
dftgc: [Complex::default(); COMPLEX_COUNT],
73+
dftgc: Aligned::new([Complex::default(); COMPLEX_COUNT]),
6774
};
6875

69-
let mut dftgc = [Complex::default(); COMPLEX_COUNT];
70-
denoiser.real_to_complex_3d(&dftgr, &mut dftgc);
76+
let mut dftgc = Aligned::new([Complex::default(); COMPLEX_COUNT]);
77+
denoiser.real_to_complex_3d(&dftgr.data, &mut dftgc.data);
7178
denoiser.dftgc = dftgc;
7279

7380
denoiser
@@ -78,6 +85,40 @@ impl<T> Denoiser<T> for DftDenoiserAvx2<T>
7885
where
7986
T: Pixel,
8087
{
88+
#[inline]
89+
unsafe fn proc0(
90+
&self, s0: &[T], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize,
91+
src_scale: f32,
92+
) {
93+
let s0 = s0.as_ptr();
94+
let s1 = s1.as_ptr();
95+
let dest = dest.as_mut_ptr();
96+
let src_scale = _mm256_set1_ps(src_scale);
97+
98+
for u in 0..p1 {
99+
for v in (0..p1).step_by(8) {
100+
let s0 = s0.add(u * p0 + v);
101+
let s1 = s1.add(u * p1 + v);
102+
let dest = dest.add(u * p1 + v);
103+
104+
let v_s0 = if size_of::<T>() == 1 {
105+
_mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_load_si128(s0.cast())))
106+
} else {
107+
_mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_load_si128(s0.cast())))
108+
};
109+
let v_s1 = _mm256_load_ps(s1);
110+
if size_of::<T>() == 1 {
111+
_mm256_store_ps(dest, _mm256_mul_ps(v_s0, v_s1));
112+
} else {
113+
_mm256_store_ps(
114+
dest,
115+
_mm256_mul_ps(_mm256_mul_ps(v_s0, src_scale), v_s1),
116+
);
117+
}
118+
}
119+
}
120+
}
121+
81122
#[inline(always)]
82123
fn pad_dimensions(&self, plane: usize) -> (usize, usize) {
83124
self.pad_dimensions[plane]
@@ -105,17 +146,17 @@ where
105146

106147
#[inline(always)]
107148
fn hw(&self) -> &[f32; BLOCK_VOLUME] {
108-
&self.hw
149+
&self.hw.data
109150
}
110151

111152
#[inline(always)]
112153
fn dftgc(&self) -> &[Complex<f32>; COMPLEX_COUNT] {
113-
&self.dftgc
154+
&self.dftgc.data
114155
}
115156

116157
#[inline(always)]
117158
fn sigmas(&self) -> &[f32; CCNT2] {
118-
&self.sigmas
159+
&self.sigmas.data
119160
}
120161

121162
#[inline(always)]

0 commit comments

Comments
 (0)