@@ -235,38 +235,45 @@ where
235
235
. map ( |f| f[ p] . data_origin ( ) )
236
236
. collect :: < ArrayVec < _ , TB_SIZE > > ( ) ;
237
237
238
- for y in ( 0 ..effective_height) . step_by ( INC ) {
239
- for x in ( 0 ..=( pad_width - SB_SIZE ) ) . step_by ( INC ) {
240
- for z in 0 ..TB_SIZE {
241
- self . proc0 (
242
- & src_planes[ z] [ x..] ,
243
- & self . hw [ ( BLOCK_AREA * z) ..] ,
244
- & mut dftr[ ( BLOCK_AREA * z) ..] ,
245
- src_stride,
238
+ // SAFETY: We know the size of the planes we're working on,
239
+ // so we can safely ensure we are not out of bounds.
240
+ // There are a fair number of unsafe function calls here
241
+ // which are unsafe for optimization purposes.
242
+ // All are safe as long as we do not pass out-of-bounds parameters.
243
+ unsafe {
244
+ for y in ( 0 ..effective_height) . step_by ( INC ) {
245
+ for x in ( 0 ..=( pad_width - SB_SIZE ) ) . step_by ( INC ) {
246
+ for z in 0 ..TB_SIZE {
247
+ self . proc0 (
248
+ & src_planes[ z] [ x..] ,
249
+ & self . hw [ ( BLOCK_AREA * z) ..] ,
250
+ & mut dftr[ ( BLOCK_AREA * z) ..] ,
251
+ src_stride,
252
+ SB_SIZE ,
253
+ self . src_scale ,
254
+ ) ;
255
+ }
256
+
257
+ self . real_to_complex_3d ( & dftr, & mut dftc) ;
258
+ self . remove_mean ( & mut dftc, & self . dftgc , & mut means) ;
259
+
260
+ self . filter_coeffs ( & mut dftc) ;
261
+
262
+ self . add_mean ( & mut dftc, & means) ;
263
+ self . complex_to_real_3d ( & dftc, & mut dftr) ;
264
+
265
+ self . proc1 (
266
+ & dftr[ ( TB_MIDPOINT * BLOCK_AREA ) ..] ,
267
+ & self . hw [ ( TB_MIDPOINT * BLOCK_AREA ) ..] ,
268
+ & mut ebuff[ ( y * ebuff_stride + x) ..] ,
246
269
SB_SIZE ,
247
- self . src_scale ,
270
+ ebuff_stride ,
248
271
) ;
249
272
}
250
273
251
- self . real_to_complex_3d ( & dftr, & mut dftc) ;
252
- self . remove_mean ( & mut dftc, & self . dftgc , & mut means) ;
253
-
254
- self . filter_coeffs ( & mut dftc) ;
255
-
256
- self . add_mean ( & mut dftc, & means) ;
257
- self . complex_to_real_3d ( & dftc, & mut dftr) ;
258
-
259
- self . proc1 (
260
- & dftr[ ( TB_MIDPOINT * BLOCK_AREA ) ..] ,
261
- & self . hw [ ( TB_MIDPOINT * BLOCK_AREA ) ..] ,
262
- & mut ebuff[ ( y * ebuff_stride + x) ..] ,
263
- SB_SIZE ,
264
- ebuff_stride,
265
- ) ;
266
- }
267
-
268
- for q in 0 ..TB_SIZE {
269
- src_planes[ q] = & src_planes[ q] [ ( INC * src_stride) ..] ;
274
+ for q in 0 ..TB_SIZE {
275
+ src_planes[ q] = & src_planes[ q] [ ( INC * src_stride) ..] ;
276
+ }
270
277
}
271
278
}
272
279
@@ -313,6 +320,7 @@ where
313
320
hw
314
321
}
315
322
323
+ #[ inline( always) ]
316
324
// Hanning windowing
317
325
fn spatial_window ( n : f64 ) -> f64 {
318
326
0.5 - 0.5 * ( 2.0 * PI * n / SB_SIZE as f64 ) . cos ( )
@@ -345,35 +353,44 @@ where
345
353
}
346
354
}
347
355
348
- fn proc0 (
356
+ #[ inline]
357
+ unsafe fn proc0 (
349
358
& self , s0 : & [ T ] , s1 : & [ f32 ] , dest : & mut [ f32 ] , p0 : usize , p1 : usize ,
350
359
src_scale : f32 ,
351
360
) {
352
- let s0 = s0. chunks ( p0 ) ;
353
- let s1 = s1. chunks ( p1 ) ;
354
- let dest = dest. chunks_mut ( p1 ) ;
361
+ let s0 = s0. as_ptr ( ) ;
362
+ let s1 = s1. as_ptr ( ) ;
363
+ let dest = dest. as_mut_ptr ( ) ;
355
364
356
- for ( s0 , ( s1 , dest ) ) in s0 . zip ( s1 . zip ( dest ) ) . take ( p1 ) {
365
+ for u in 0 ..p1 {
357
366
for v in 0 ..p1 {
358
- dest[ v] = u16:: cast_from ( s0[ v] ) as f32 * src_scale * s1[ v] ;
367
+ let s0 = s0. add ( u * p0 + v) ;
368
+ let s1 = s1. add ( u * p1 + v) ;
369
+ let dest = dest. add ( u * p1 + v) ;
370
+ dest. write ( u16:: cast_from ( s0. read ( ) ) as f32 * src_scale * s1. read ( ) )
359
371
}
360
372
}
361
373
}
362
374
363
- fn proc1 (
375
+ #[ inline]
376
+ unsafe fn proc1 (
364
377
& self , s0 : & [ f32 ] , s1 : & [ f32 ] , dest : & mut [ f32 ] , p0 : usize , p1 : usize ,
365
378
) {
366
- let s0 = s0. chunks ( p0 ) ;
367
- let s1 = s1. chunks ( p0 ) ;
368
- let dest = dest. chunks_mut ( p1 ) ;
379
+ let s0 = s0. as_ptr ( ) ;
380
+ let s1 = s1. as_ptr ( ) ;
381
+ let dest = dest. as_mut_ptr ( ) ;
369
382
370
- for ( s0 , ( s1 , dest ) ) in s0 . zip ( s1 . zip ( dest ) ) . take ( p0 ) {
383
+ for u in 0 ..p0 {
371
384
for v in 0 ..p0 {
372
- dest[ v] += s0[ v] * s1[ v] ;
385
+ let s0 = s0. add ( u * p0 + v) ;
386
+ let s1 = s1. add ( u * p0 + v) ;
387
+ let dest = dest. add ( u * p1 + v) ;
388
+ dest. write ( s0. read ( ) . mul_add ( s1. read ( ) , dest. read ( ) ) ) ;
373
389
}
374
390
}
375
391
}
376
392
393
+ #[ inline]
377
394
fn remove_mean (
378
395
& self , dftc : & mut [ Complex < f32 > ; COMPLEX_COUNT ] ,
379
396
dftgc : & [ Complex < f32 > ; COMPLEX_COUNT ] ,
@@ -389,6 +406,7 @@ where
389
406
}
390
407
}
391
408
409
+ #[ inline]
392
410
fn add_mean (
393
411
& self , dftc : & mut [ Complex < f32 > ; COMPLEX_COUNT ] ,
394
412
means : & [ Complex < f32 > ; COMPLEX_COUNT ] ,
@@ -399,6 +417,7 @@ where
399
417
}
400
418
}
401
419
420
+ #[ inline]
402
421
// Applies a generalized wiener filter
403
422
fn filter_coeffs ( & self , dftc : & mut [ Complex < f32 > ; COMPLEX_COUNT ] ) {
404
423
for h in 0 ..COMPLEX_COUNT {
@@ -495,11 +514,8 @@ where
495
514
for ( ebuff, dest) in ebuff. zip ( dest) . take ( dest_height) {
496
515
for x in 0 ..dest_width {
497
516
let fval = ebuff[ x] . mul_add ( self . dest_scale , 0.5 ) ;
498
- dest[ x] = clamp (
499
- T :: cast_from ( fval. round ( ) as u16 ) ,
500
- T :: cast_from ( 0u16 ) ,
501
- self . peak ,
502
- ) ;
517
+ dest[ x] =
518
+ clamp ( T :: cast_from ( fval as u16 ) , T :: cast_from ( 0u16 ) , self . peak ) ;
503
519
}
504
520
}
505
521
}
@@ -544,6 +560,7 @@ where
544
560
}
545
561
}
546
562
563
+ #[ inline( always) ]
547
564
fn extra ( a : usize , b : usize ) -> usize {
548
565
if a % b > 0 {
549
566
b - ( a % b)
0 commit comments