xiph · Oct 18, 2023 · Oct 19, 2023 · Oct 19, 2023 · Oct 19, 2023 · Oct 20, 2023
diff --git a/benches/mc.rs b/benches/mc.rs
@@ -263,7 +263,7 @@ fn bench_prep_8tap_top_left_lbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u8>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -294,7 +294,7 @@ fn bench_prep_8tap_top_lbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u8>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -325,7 +325,7 @@ fn bench_prep_8tap_left_lbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u8>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -356,7 +356,7 @@ fn bench_prep_8tap_center_lbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u8>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -387,7 +387,7 @@ fn bench_prep_8tap_top_left_hbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u16>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -418,7 +418,7 @@ fn bench_prep_8tap_top_hbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u16>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -449,7 +449,7 @@ fn bench_prep_8tap_left_hbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u16>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -480,7 +480,7 @@ fn bench_prep_8tap_center_hbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u16>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,

diff --git a/benches/transform.rs b/benches/transform.rs
@@ -15,6 +15,7 @@ use rav1e::bench::transform;
 use rav1e::bench::transform::{
   forward_transform, get_valid_txfm_types, TxSize,
 };
+use std::mem::MaybeUninit;
 
 fn init_buffers(size: usize) -> (Vec<i32>, Vec<i32>) {
   let mut ra = ChaChaRng::from_seed([0; 32]);
@@ -96,7 +97,7 @@ pub fn bench_forward_transforms(c: &mut Criterion) {
 
     let input: Vec<i16> =
       (0..area).map(|_| rng.gen_range(-255..256)).collect();
-    let mut output = vec![0i16; area];
+    let mut output = vec![MaybeUninit::new(0i16); area];
 
     for &tx_type in get_valid_txfm_types(tx_size) {
       group.bench_function(

diff --git a/src/asm/aarch64/transform/inverse.rs b/src/asm/aarch64/transform/inverse.rs
@@ -20,6 +20,22 @@ pub fn inverse_transform_add<T: Pixel>(
   input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
   tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
 ) {
+  if tx_type == TxType::WHT_WHT {
+    debug_assert!(tx_size == TxSize::TX_4X4);
+    match T::type_enum() {
+      PixelType::U8 => {
+        if let Some(func) = INV_TXFM_WHT_FN[cpu.as_index()] {
+          return call_inverse_func(func, input, output, eob, 4, 4, bd);
+        }
+      }
+      PixelType::U16 if bd == 10 => {
+        if let Some(func) = INV_TXFM_WHT_HBD_FN[cpu.as_index()] {
+          return call_inverse_hbd_func(func, input, output, eob, 4, 4, bd);
+        }
+      }
+      PixelType::U16 => {}
+    }
+  }
   match T::type_enum() {
     PixelType::U8 => {
       if let Some(func) = INV_TXFM_FNS[cpu.as_index()]
@@ -57,6 +73,32 @@ pub fn inverse_transform_add<T: Pixel>(
   rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu);
 }
 
+extern {
+  fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_neon(
+    dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
+  );
+  fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_neon(
+    dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
+    bitdepth_max: i32,
+  );
+}
+const INV_TXFM_WHT_FN_NEON: Option<InvTxfmFunc> =
+  Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_neon as _);
+const INV_TXFM_WHT_HBD_FN_NEON: Option<InvTxfmHBDFunc> =
+  Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_neon as _);
+
+cpu_function_lookup_table!(
+  INV_TXFM_WHT_FN: [Option<InvTxfmFunc>],
+  default: None,
+  [NEON]
+);
+
+cpu_function_lookup_table!(
+  INV_TXFM_WHT_HBD_FN: [Option<InvTxfmHBDFunc>],
+  default: None,
+  [NEON]
+);
+
 macro_rules! decl_itx_fns {
   // Takes a 2d list of tx types for W and H
   ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr,
@@ -100,7 +142,7 @@ macro_rules! decl_itx_hbd_fns {
             // Note: type1 and type2 are flipped
             fn [<rav1e_inv_txfm_add_ $TYPE2 _$TYPE1 _$W x $H _16bpc_$OPT_LOWER>](
               dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16,
-              eob: i32,
+              eob: i32, bitdepth_max: i32,
             );
           }
         )*

diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs
@@ -39,18 +39,10 @@ mod test {
 
   fn pred_matches_inner<T: Pixel>(cpu: CpuFeatureLevel, bit_depth: usize) {
     let tx_size = TxSize::TX_4X4;
-    // SAFETY: We write to the array below before reading from it.
-    let mut ac: Aligned<[i16; 32 * 32]> = unsafe { Aligned::uninitialized() };
-    for i in 0..ac.data.len() {
-      ac.data[i] = i as i16 - 16 * 32;
-    }
-    // SAFETY: We write to the array below before reading from it.
-    let mut edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> =
-      unsafe { Aligned::uninitialized() };
-    for i in 0..edge_buf.data.len() {
-      edge_buf.data[i] =
-        T::cast_from(((i ^ 1) + 32).saturating_sub(2 * MAX_TX_SIZE));
-    }
+    let ac: Aligned<[i16; 32 * 32]> = Aligned::from_fn(|i| i as i16 - 16 * 32);
+    let edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> = Aligned::from_fn(|i| {
+      T::cast_from(((i ^ 1) + 32).saturating_sub(2 * MAX_TX_SIZE))
+    });
 
     let ief_params_all = [
       None,

diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs
@@ -9,13 +9,14 @@
 
 use crate::tiling::PlaneRegionMut;
 use crate::util::*;
+use std::mem::MaybeUninit;
 
 // Note: Input coeffs are mutable since the assembly uses them as a scratchpad
 pub type InvTxfmFunc =
   unsafe extern fn(*mut u8, libc::ptrdiff_t, *mut i16, i32);
 
 pub type InvTxfmHBDFunc =
-  unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32);
+  unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32, i32);
 
 pub fn call_inverse_func<T: Pixel>(
   func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>,
@@ -27,13 +28,13 @@ pub fn call_inverse_func<T: Pixel>(
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
 
   // SAFETY: We write to the array below before reading from it.
-  let mut copied: Aligned<[T::Coeff; 32 * 32]> =
+  let mut copied: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
 
   // Convert input to 16-bits.
   // TODO: Remove by changing inverse assembly to not overwrite its input
   for (a, b) in copied.data.iter_mut().zip(input) {
-    *a = *b;
+    a.write(*b);
   }
 
   // perform the inverse transform
@@ -51,19 +52,19 @@ pub fn call_inverse_func<T: Pixel>(
 pub fn call_inverse_hbd_func<T: Pixel>(
   func: InvTxfmHBDFunc, input: &[T::Coeff],
   output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize,
-  _bd: usize,
+  bd: usize,
 ) {
   // Only use at most 32 columns and 32 rows of input coefficients.
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
 
   // SAFETY: We write to the array below before reading from it.
-  let mut copied: Aligned<[T::Coeff; 32 * 32]> =
+  let mut copied: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
 
   // Convert input to 16-bits.
   // TODO: Remove by changing inverse assembly to not overwrite its input
   for (a, b) in copied.data.iter_mut().zip(input) {
-    *a = *b;
+    a.write(*b);
   }
 
   // perform the inverse transform
@@ -74,6 +75,7 @@ pub fn call_inverse_hbd_func<T: Pixel>(
       T::to_asm_stride(output.plane_cfg.stride),
       copied.data.as_mut_ptr() as *mut _,
       eob as i32 - 1,
+      (1 << bd) - 1,
     );
   }
 }
@@ -88,6 +90,7 @@ pub mod test {
   use crate::transform::TxSize::*;
   use crate::transform::*;
   use rand::{random, thread_rng, Rng};
+  use std::mem::MaybeUninit;
 
   pub fn pick_eob<T: Coefficient>(
     coeffs: &mut [T], tx_size: TxSize, tx_type: TxType, sub_h: usize,
@@ -105,7 +108,8 @@ pub mod test {
     let mut eob = 0;
     let mut exit = 0;
 
-    let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan;
+    // Mask the index so that WHT_WHT (16) reuses the DCT_DCT (0) scan table
+    let scan = av1_scan_orders[tx_size as usize][(tx_type as usize) & 15].scan;
 
     for (i, &pos) in scan.iter().enumerate() {
       exit = i;
@@ -145,22 +149,26 @@ pub mod test {
     for sub_h in 0..sub_h_iterations {
       let mut src_storage = [T::zero(); 64 * 64];
       let src = &mut src_storage[..tx_size.area()];
-      let mut dst = Plane::from_slice(&[T::zero(); 64 * 64], 64);
-      // SAFETY: We write to the array below before reading from it.
-      let mut res_storage: Aligned<[i16; 64 * 64]> =
+      let mut dst = Plane::from_slice(
+        &[T::zero(); 64 * 64][..tx_size.area()],
+        tx_size.width(),
+      );
+      let mut res_storage: Aligned<[MaybeUninit<i16>; 64 * 64]> =
         unsafe { Aligned::uninitialized() };
       let res = &mut res_storage.data[..tx_size.area()];
       // SAFETY: We write to the array below before reading from it.
-      let mut freq_storage: Aligned<[T::Coeff; 64 * 64]> =
+      let mut freq_storage: Aligned<[MaybeUninit<T::Coeff>; 64 * 64]> =
         unsafe { Aligned::uninitialized() };
       let freq = &mut freq_storage.data[..tx_size.area()];
       for ((r, s), d) in
         res.iter_mut().zip(src.iter_mut()).zip(dst.data.iter_mut())
       {
         *s = T::cast_from(random::<u16>() >> (16 - bit_depth));
         *d = T::cast_from(random::<u16>() >> (16 - bit_depth));
-        *r = i16::cast_from(*s) - i16::cast_from(*d);
+        r.write(i16::cast_from(*s) - i16::cast_from(*d));
       }
+      // SAFETY: res, src and dst.data all have the same length, so the zipped loop above wrote every element of res
+      let res = unsafe { slice_assume_init_mut(res) };
       forward_transform(
         res,
         freq,
@@ -170,6 +178,8 @@ pub mod test {
         bit_depth,
         CpuFeatureLevel::RUST,
       );
+      // SAFETY: forward_transform initialized freq
+      let freq = unsafe { slice_assume_init_mut(freq) };
 
       let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h);
       let mut rust_dst = dst.clone();
@@ -223,13 +233,16 @@ pub mod test {
     };
 
     ($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt,
-     $TYPES84:tt, $DIMS84:tt) => {
+     $TYPES84:tt, $DIMS84:tt, $TYPES4:tt, $DIMS4:tt) => {
       test_itx_fns!([$TYPES64], $DIMS64);
       test_itx_fns!([$TYPES64, $TYPES32], $DIMS32);
       test_itx_fns!([$TYPES64, $TYPES32, $TYPES16], $DIMS16);
       test_itx_fns!(
         [$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84
       );
+      test_itx_fns!(
+        [$TYPES64, $TYPES32, $TYPES16, $TYPES84, $TYPES4], $DIMS4
+      );
     };
   }
 
@@ -254,13 +267,16 @@ pub mod test {
       (TxType::FLIPADST_FLIPADST, flipadst, flipadst)
     ],
     [(16, 16)],
-    // 8x, 4x and 16x (minus 16x16)
+    // 8x, 4x and 16x (minus 16x16 and 4x4)
     [
       (TxType::V_ADST, adst, identity),
       (TxType::H_ADST, identity, adst),
       (TxType::V_FLIPADST, flipadst, identity),
       (TxType::H_FLIPADST, identity, flipadst)
     ],
-    [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)]
+    [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8)],
+    // 4x4
+    [(TxType::WHT_WHT, wht, wht)],
+    [(4, 4)]
   );
 }
diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs
@@ -17,6 +17,7 @@ use crate::cpu_features::CpuFeatureLevel;
 use crate::quantize::*;
 use crate::transform::TxSize;
 use crate::util::*;
+use std::mem::MaybeUninit;
 
 type DequantizeFn = unsafe fn(
   qindex: u8,
@@ -37,21 +38,21 @@ cpu_function_lookup_table!(
 
 #[inline(always)]
 pub fn dequantize<T: Coefficient>(
-  qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-  bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
+  qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [MaybeUninit<T>],
+  tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
+  cpu: CpuFeatureLevel,
 ) {
-  let call_rust = |rcoeffs: &mut [T]| {
+  let call_rust = |rcoeffs: &mut [MaybeUninit<T>]| {
     crate::quantize::rust::dequantize(
       qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q,
       ac_delta_q, cpu,
     );
   };
 
   #[cfg(any(feature = "check_asm", test))]
-  let ref_rcoeffs = {
+  let mut ref_rcoeffs = {
     let area = av1_get_coded_tx_size(tx_size).area();
-    let mut copy = vec![T::cast_from(0); area];
-    copy[..].copy_from_slice(&rcoeffs[..area]);
+    let mut copy = vec![MaybeUninit::new(T::cast_from(0)); area];
     call_rust(&mut copy);
     copy
   };
@@ -82,7 +83,9 @@ pub fn dequantize<T: Coefficient>(
   #[cfg(any(feature = "check_asm", test))]
   {
     let area = av1_get_coded_tx_size(tx_size).area();
-    assert_eq!(&rcoeffs[..area], &ref_rcoeffs[..]);
+    let rcoeffs = unsafe { assume_slice_init_mut(&mut rcoeffs[..area]) };
+    let ref_rcoeffs = unsafe { assume_slice_init_mut(&mut ref_rcoeffs[..]) };
+    assert_eq!(rcoeffs, ref_rcoeffs);
   }
 }
 
@@ -157,6 +160,7 @@ mod test {
   use super::*;
   use rand::distributions::{Distribution, Uniform};
   use rand::{thread_rng, Rng};
+  use std::mem::MaybeUninit;
 
   #[test]
   fn dequantize_test() {
@@ -190,7 +194,7 @@ mod test {
 
       for &eob in &eobs {
         let mut qcoeffs = Aligned::new([0i16; 32 * 32]);
-        let mut rcoeffs = Aligned::new([0i16; 32 * 32]);
+        let mut rcoeffs = Aligned::new([MaybeUninit::new(0i16); 32 * 32]);
 
         // Generate quantized coefficients up to the eob
         let between = Uniform::from(-i16::MAX..=i16::MAX);