Skip to content

Commit d090077

Browse files
committed
Autovectorization pass
1 parent e0a4b2a commit d090077

File tree

1 file changed

+62
-45
lines changed

1 file changed

+62
-45
lines changed

src/denoise.rs

+62-45
Original file line numberDiff line numberDiff line change
@@ -235,38 +235,45 @@ where
235235
.map(|f| f[p].data_origin())
236236
.collect::<ArrayVec<_, TB_SIZE>>();
237237

238-
for y in (0..effective_height).step_by(INC) {
239-
for x in (0..=(pad_width - SB_SIZE)).step_by(INC) {
240-
for z in 0..TB_SIZE {
241-
self.proc0(
242-
&src_planes[z][x..],
243-
&self.hw[(BLOCK_AREA * z)..],
244-
&mut dftr[(BLOCK_AREA * z)..],
245-
src_stride,
238+
// SAFETY: We know the size of the planes we're working on,
239+
// so we can safely ensure we are not out of bounds.
240+
// Several of the functions called here are marked `unsafe`
241+
// because they use unchecked raw-pointer accesses for performance.
242+
// They are sound as long as we do not pass out-of-bounds parameters.
243+
unsafe {
244+
for y in (0..effective_height).step_by(INC) {
245+
for x in (0..=(pad_width - SB_SIZE)).step_by(INC) {
246+
for z in 0..TB_SIZE {
247+
self.proc0(
248+
&src_planes[z][x..],
249+
&self.hw[(BLOCK_AREA * z)..],
250+
&mut dftr[(BLOCK_AREA * z)..],
251+
src_stride,
252+
SB_SIZE,
253+
self.src_scale,
254+
);
255+
}
256+
257+
self.real_to_complex_3d(&dftr, &mut dftc);
258+
self.remove_mean(&mut dftc, &self.dftgc, &mut means);
259+
260+
self.filter_coeffs(&mut dftc);
261+
262+
self.add_mean(&mut dftc, &means);
263+
self.complex_to_real_3d(&dftc, &mut dftr);
264+
265+
self.proc1(
266+
&dftr[(TB_MIDPOINT * BLOCK_AREA)..],
267+
&self.hw[(TB_MIDPOINT * BLOCK_AREA)..],
268+
&mut ebuff[(y * ebuff_stride + x)..],
246269
SB_SIZE,
247-
self.src_scale,
270+
ebuff_stride,
248271
);
249272
}
250273

251-
self.real_to_complex_3d(&dftr, &mut dftc);
252-
self.remove_mean(&mut dftc, &self.dftgc, &mut means);
253-
254-
self.filter_coeffs(&mut dftc);
255-
256-
self.add_mean(&mut dftc, &means);
257-
self.complex_to_real_3d(&dftc, &mut dftr);
258-
259-
self.proc1(
260-
&dftr[(TB_MIDPOINT * BLOCK_AREA)..],
261-
&self.hw[(TB_MIDPOINT * BLOCK_AREA)..],
262-
&mut ebuff[(y * ebuff_stride + x)..],
263-
SB_SIZE,
264-
ebuff_stride,
265-
);
266-
}
267-
268-
for q in 0..TB_SIZE {
269-
src_planes[q] = &src_planes[q][(INC * src_stride)..];
274+
for q in 0..TB_SIZE {
275+
src_planes[q] = &src_planes[q][(INC * src_stride)..];
276+
}
270277
}
271278
}
272279

@@ -313,6 +320,7 @@ where
313320
hw
314321
}
315322

323+
#[inline(always)]
316324
// Hanning windowing
317325
fn spatial_window(n: f64) -> f64 {
318326
0.5 - 0.5 * (2.0 * PI * n / SB_SIZE as f64).cos()
@@ -345,35 +353,44 @@ where
345353
}
346354
}
347355

348-
fn proc0(
356+
#[inline]
357+
unsafe fn proc0(
349358
&self, s0: &[T], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize,
350359
src_scale: f32,
351360
) {
352-
let s0 = s0.chunks(p0);
353-
let s1 = s1.chunks(p1);
354-
let dest = dest.chunks_mut(p1);
361+
let s0 = s0.as_ptr();
362+
let s1 = s1.as_ptr();
363+
let dest = dest.as_mut_ptr();
355364

356-
for (s0, (s1, dest)) in s0.zip(s1.zip(dest)).take(p1) {
365+
for u in 0..p1 {
357366
for v in 0..p1 {
358-
dest[v] = u16::cast_from(s0[v]) as f32 * src_scale * s1[v];
367+
let s0 = s0.add(u * p0 + v);
368+
let s1 = s1.add(u * p1 + v);
369+
let dest = dest.add(u * p1 + v);
370+
dest.write(u16::cast_from(s0.read()) as f32 * src_scale * s1.read())
359371
}
360372
}
361373
}
362374

363-
fn proc1(
375+
#[inline]
376+
unsafe fn proc1(
364377
&self, s0: &[f32], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize,
365378
) {
366-
let s0 = s0.chunks(p0);
367-
let s1 = s1.chunks(p0);
368-
let dest = dest.chunks_mut(p1);
379+
let s0 = s0.as_ptr();
380+
let s1 = s1.as_ptr();
381+
let dest = dest.as_mut_ptr();
369382

370-
for (s0, (s1, dest)) in s0.zip(s1.zip(dest)).take(p0) {
383+
for u in 0..p0 {
371384
for v in 0..p0 {
372-
dest[v] += s0[v] * s1[v];
385+
let s0 = s0.add(u * p0 + v);
386+
let s1 = s1.add(u * p0 + v);
387+
let dest = dest.add(u * p1 + v);
388+
dest.write(s0.read().mul_add(s1.read(), dest.read()));
373389
}
374390
}
375391
}
376392

393+
#[inline]
377394
fn remove_mean(
378395
&self, dftc: &mut [Complex<f32>; COMPLEX_COUNT],
379396
dftgc: &[Complex<f32>; COMPLEX_COUNT],
@@ -389,6 +406,7 @@ where
389406
}
390407
}
391408

409+
#[inline]
392410
fn add_mean(
393411
&self, dftc: &mut [Complex<f32>; COMPLEX_COUNT],
394412
means: &[Complex<f32>; COMPLEX_COUNT],
@@ -399,6 +417,7 @@ where
399417
}
400418
}
401419

420+
#[inline]
402421
// Applies a generalized wiener filter
403422
fn filter_coeffs(&self, dftc: &mut [Complex<f32>; COMPLEX_COUNT]) {
404423
for h in 0..COMPLEX_COUNT {
@@ -495,11 +514,8 @@ where
495514
for (ebuff, dest) in ebuff.zip(dest).take(dest_height) {
496515
for x in 0..dest_width {
497516
let fval = ebuff[x].mul_add(self.dest_scale, 0.5);
498-
dest[x] = clamp(
499-
T::cast_from(fval.round() as u16),
500-
T::cast_from(0u16),
501-
self.peak,
502-
);
517+
dest[x] =
518+
clamp(T::cast_from(fval as u16), T::cast_from(0u16), self.peak);
503519
}
504520
}
505521
}
@@ -544,6 +560,7 @@ where
544560
}
545561
}
546562

563+
#[inline(always)]
547564
fn extra(a: usize, b: usize) -> usize {
548565
if a % b > 0 {
549566
b - (a % b)

0 commit comments

Comments
 (0)