diff --git a/src/activity.rs b/src/activity.rs index 5a8400d978..facca82125 100644 --- a/src/activity.rs +++ b/src/activity.rs @@ -56,11 +56,11 @@ impl ActivityMask { } #[hawktracer(activity_mask_fill_scales)] - pub fn fill_scales( - &self, bit_depth: usize, activity_scales: &mut Box<[DistortionScale]>, + pub fn fill_scales( + &self, activity_scales: &mut Box<[DistortionScale]>, ) { for (dst, &src) in activity_scales.iter_mut().zip(self.variances.iter()) { - *dst = ssim_boost(src, src, bit_depth); + *dst = ssim_boost::(src, src); } } } @@ -146,21 +146,20 @@ fn ssim_boost_rsqrt(x: u64) -> RsqrtOutput { } #[inline(always)] -pub fn ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> DistortionScale { - DistortionScale(apply_ssim_boost( +pub fn ssim_boost(svar: u32, dvar: u32) -> DistortionScale { + DistortionScale(apply_ssim_boost::( DistortionScale::default().0, svar, dvar, - bit_depth, )) } /// Apply ssim boost to a given input #[inline(always)] -pub fn apply_ssim_boost( - input: u32, svar: u32, dvar: u32, bit_depth: usize, +pub fn apply_ssim_boost( + input: u32, svar: u32, dvar: u32, ) -> u32 { - let coeff_shift = bit_depth - 8; + let coeff_shift = BD - 8; // Scale dvar and svar to lbd range to prevent overflows. let svar = (svar >> (2 * coeff_shift)) as u64; @@ -199,7 +198,7 @@ mod ssim_boost_tests { let max_pix_diff = (1 << 12) - 1; let max_pix_sse = max_pix_diff * max_pix_diff; let max_variance = max_pix_diff * 8 * 8 / 4; - apply_ssim_boost(max_pix_sse * 8 * 8, max_variance, max_variance, 12); + apply_ssim_boost::<12>(max_pix_sse * 8 * 8, max_variance, max_variance); } /// Floating point reference version of `ssim_boost` @@ -234,8 +233,8 @@ mod ssim_boost_tests { let dvar = rng.gen_range(0..(1 << scale)); let float = reference_ssim_boost(svar, dvar, 12); - let fixed = - apply_ssim_boost(1 << 23, svar, dvar, 12) as f64 / (1 << 23) as f64; + let fixed = apply_ssim_boost::<12>(1 << 23, svar, dvar) as f64 + / (1 << 23) as f64; // Compare the two versions max_relative_error = @@ -259,8 +258,13 @@ mod ssim_boost_tests { let scale = ((1 << bd) - 1) << (6 - 2 + bd - 8); for svar in scale..(scale << 2) { let float = ((scale << 1) as f64 / svar as f64).cbrt(); - let fixed = - apply_ssim_boost(1 << 23, svar, svar, bd) as f64 / (1 << 23) as f64; + let fixed = match bd { + 8 => apply_ssim_boost::<8>(1 << 23, svar, svar), + 10 => apply_ssim_boost::<10>(1 << 23, svar, svar), + 12 => apply_ssim_boost::<12>(1 << 23, svar, svar), + _ => unimplemented!(), + } as f64 + / (1 << 23) as f64; // Compare the two versions max_relative_error = diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs index fbb5ad3e5b..42f3b211a3 100644 --- a/src/api/config/mod.rs +++ b/src/api/config/mod.rs @@ -248,8 +248,15 @@ impl Config { // First-pass parameters depend on whether second-pass is in effect. // So `init_first_pass` must follow `init_second_pass`. if self.rate_control.emit_pass_data { - let maybe_pass1_log_base_q = (self.rate_control.summary.is_none()) - .then(|| inner.rc_state.select_pass1_log_base_q(&inner, 0)); + let maybe_pass1_log_base_q = + (self.rate_control.summary.is_none()).then(|| { + match self.enc.bit_depth { + 8 => inner.rc_state.select_pass1_log_base_q::<_, 8>(&inner, 0), + 10 => inner.rc_state.select_pass1_log_base_q::<_, 10>(&inner, 0), + 12 => inner.rc_state.select_pass1_log_base_q::<_, 12>(&inner, 0), + _ => unimplemented!(), + } + }); inner.rc_state.init_first_pass(maybe_pass1_log_base_q); } diff --git a/src/api/context.rs b/src/api/context.rs index 58d697fd61..9366c53fbb 100644 --- a/src/api/context.rs +++ b/src/api/context.rs @@ -129,7 +129,12 @@ impl Context { } let inner = &mut self.inner; - let run = move || inner.send_frame(frame, params); + let run = move || match inner.config.bit_depth { + 8 => inner.send_frame::<8>(frame, params), + 10 => inner.send_frame::<10>(frame, params), + 12 => inner.send_frame::<12>(frame, params), + _ => unimplemented!(), + }; match &self.pool { Some(pool) => pool.install(run), @@ -302,7 +307,12 @@ impl Context { #[inline] pub fn receive_packet(&mut self) -> Result, EncoderStatus> { let inner = &mut self.inner; - let mut run = move || inner.receive_packet(); + let mut run = move || match inner.config.bit_depth { + 8 => inner.receive_packet::<8>(), + 10 => inner.receive_packet::<10>(), + 12 => inner.receive_packet::<12>(), + _ => unimplemented!(), + }; match &self.pool { Some(pool) => pool.install(run), diff --git a/src/api/internal.rs b/src/api/internal.rs index 1a978de836..5c379d5a86 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -317,7 +317,7 @@ impl ContextInner { } #[hawktracer(send_frame)] - pub fn send_frame( + pub fn send_frame( &mut self, mut frame: Option>>, params: Option, ) -> Result<(), EncoderStatus> { @@ -376,7 +376,7 @@ impl ContextInner { break; } - Self::compute_keyframe_placement( + Self::compute_keyframe_placement::( cur_lookahead_frames, &self.keyframes_forced, &mut self.keyframe_detector, @@ -385,7 +385,7 @@ impl ContextInner { ); } } else { - Self::compute_keyframe_placement( + Self::compute_keyframe_placement::( &lookahead_frames, &self.keyframes_forced, &mut self.keyframe_detector, @@ -395,7 +395,7 @@ impl ContextInner { } } - self.compute_frame_invariants(); + self.compute_frame_invariants::(); Ok(()) } @@ -649,7 +649,9 @@ impl ContextInner { /// function must be called after every new `FrameInvariants` is initially /// computed. #[hawktracer(compute_lookahead_motion_vectors)] - fn compute_lookahead_motion_vectors(&mut self, output_frameno: u64) { + fn compute_lookahead_motion_vectors( + &mut self, output_frameno: u64, + ) { let frame_data = self.frame_data.get(&output_frameno).unwrap(); // We're only interested in valid frames which are not show-existing-frame. @@ -665,7 +667,7 @@ impl ContextInner { let qps = { let fti = frame_data.as_ref().unwrap().fi.get_frame_subtype(); - self.rc_state.select_qi( + self.rc_state.select_qi::<_, BD>( self, output_frameno, fti, @@ -742,14 +744,14 @@ impl ContextInner { fi.rec_buffer = coded_data.lookahead_rec_buffer.clone(); // Estimate lambda with rate-control dry-run - fi.set_quantizers(&qps); + fi.set_quantizers::(&qps); // TODO: as in the encoding code, key frames will have no references. // However, for block importance purposes we want key frames to act as // P-frames in this instance. // // Compute the motion vectors. - compute_motion_vectors(fi, fs, &self.inter_cfg); + compute_motion_vectors::<_, BD>(fi, fs, &self.inter_cfg); let coded_data = fi.coded_frame_data.as_mut().unwrap(); @@ -818,7 +820,9 @@ impl ContextInner { /// Computes lookahead intra cost approximations and fills in /// `lookahead_intra_costs` on the `FrameInvariants`. #[hawktracer(compute_lookahead_intra_costs)] - fn compute_lookahead_intra_costs(&mut self, output_frameno: u64) { + fn compute_lookahead_intra_costs( + &mut self, output_frameno: u64, + ) { let frame_data = self.frame_data.get(&output_frameno).unwrap(); let fd = &frame_data.as_ref(); @@ -853,23 +857,22 @@ impl ContextInner { // We use the cached values from scenechange if available, // otherwise we need to calculate them here. - estimate_intra_costs( + estimate_intra_costs::<_, BD>( temp_plane, &**frame, - fi.sequence.bit_depth, fi.cpu_feature_level, ) }); } #[hawktracer(compute_keyframe_placement)] - pub fn compute_keyframe_placement( + pub fn compute_keyframe_placement( lookahead_frames: &[&Arc>], keyframes_forced: &BTreeSet, keyframe_detector: &mut SceneChangeDetector, next_lookahead_frame: &mut u64, keyframes: &mut BTreeSet, ) { if keyframes_forced.contains(next_lookahead_frame) - || keyframe_detector.analyze_next_frame( + || keyframe_detector.analyze_next_frame::( lookahead_frames, *next_lookahead_frame, *keyframes.iter().last().unwrap(), @@ -882,24 +885,26 @@ impl ContextInner { } #[hawktracer(compute_frame_invariants)] - pub fn compute_frame_invariants(&mut self) { + pub fn compute_frame_invariants(&mut self) { while self.set_frame_properties(self.next_lookahead_output_frameno).is_ok() { - self - .compute_lookahead_motion_vectors(self.next_lookahead_output_frameno); + self.compute_lookahead_motion_vectors::( + self.next_lookahead_output_frameno, + ); if self.config.temporal_rdo() { - self.compute_lookahead_intra_costs(self.next_lookahead_output_frameno); + self.compute_lookahead_intra_costs::( + self.next_lookahead_output_frameno, + ); } self.next_lookahead_output_frameno += 1; } } #[hawktracer(update_block_importances)] - fn update_block_importances( + fn update_block_importances( fi: &FrameInvariants, me_stats: &crate::me::FrameMEStats, - frame: &Frame, reference_frame: &Frame, bit_depth: usize, - bsize: BlockSize, len: usize, - reference_frame_block_importances: &mut [f32], + frame: &Frame, reference_frame: &Frame, bsize: BlockSize, + len: usize, reference_frame_block_importances: &mut [f32], ) { let coded_data = fi.coded_frame_data.as_ref().unwrap(); let plane_org = &frame.planes[0]; @@ -946,12 +951,11 @@ impl ContextInner { height: IMPORTANCE_BLOCK_SIZE, }); - let inter_cost = get_satd( + let inter_cost = get_satd::<_, BD>( ®ion_org, ®ion_ref, bsize.width(), bsize.height(), - bit_depth, fi.cpu_feature_level, ) as f32; @@ -1058,7 +1062,7 @@ impl ContextInner { /// Computes the block importances for the current output frame. #[hawktracer(compute_block_importances)] - fn compute_block_importances(&mut self) { + fn compute_block_importances(&mut self) { // SEF don't need block importances. if self.frame_data[&self.output_frameno] .as_ref() @@ -1142,7 +1146,6 @@ impl ContextInner { } } - let bit_depth = self.config.bit_depth; let frame_data = &mut self.frame_data; let len = unique_indices.len(); @@ -1178,12 +1181,11 @@ impl ContextInner { .block_importances }) { - Self::update_block_importances( + Self::update_block_importances::( fi, me_stats, frame, reference_frame, - bit_depth, bsize, len, reference_frame_block_importances, @@ -1244,7 +1246,7 @@ impl ContextInner { } } - pub(crate) fn encode_packet( + pub(crate) fn encode_packet( &mut self, cur_output_frameno: u64, ) -> Result, EncoderStatus> { if self @@ -1325,10 +1327,9 @@ impl ContextInner { self.frame_q[&frame_data.fi.input_frameno].as_ref().unwrap(); coded_data.activity_mask = ActivityMask::from_plane(&frame.planes[0]); - coded_data.activity_mask.fill_scales( - frame_data.fi.sequence.bit_depth, - &mut coded_data.activity_scales, - ); + coded_data + .activity_mask + .fill_scales::(&mut coded_data.activity_scales); log_isqrt_mean_scale = coded_data.compute_spatiotemporal_scores(); } else { coded_data.activity_mask = ActivityMask::default(); @@ -1359,19 +1360,22 @@ impl ContextInner { } let fti = frame_data.fi.get_frame_subtype(); - let qps = self.rc_state.select_qi( + let qps = self.rc_state.select_qi::<_, BD>( self, cur_output_frameno, fti, self.maybe_prev_log_base_q, log_isqrt_mean_scale, ); - frame_data.fi.set_quantizers(&qps); + frame_data.fi.set_quantizers::(&qps); if self.rc_state.needs_trial_encode(fti) { let mut trial_fs = frame_data.fs.clone(); - let data = - encode_frame(&frame_data.fi, &mut trial_fs, &self.inter_cfg); + let data = encode_frame::<_, BD>( + &frame_data.fi, + &mut trial_fs, + &self.inter_cfg, + ); self.rc_state.update_state( (data.len() * 8) as i64, fti, @@ -1380,18 +1384,21 @@ impl ContextInner { true, false, ); - let qps = self.rc_state.select_qi( + let qps = self.rc_state.select_qi::<_, BD>( self, cur_output_frameno, fti, self.maybe_prev_log_base_q, log_isqrt_mean_scale, ); - frame_data.fi.set_quantizers(&qps); + frame_data.fi.set_quantizers::(&qps); } - let data = - encode_frame(&frame_data.fi, &mut frame_data.fs, &self.inter_cfg); + let data = encode_frame::<_, BD>( + &frame_data.fi, + &mut frame_data.fs, + &self.inter_cfg, + ); #[cfg(feature = "dump_lookahead_data")] { let input_frameno = frame_data.fi.input_frameno; @@ -1488,7 +1495,9 @@ impl ContextInner { } #[hawktracer(receive_packet)] - pub fn receive_packet(&mut self) -> Result, EncoderStatus> { + pub fn receive_packet( + &mut self, + ) -> Result, EncoderStatus> { if self.done_processing() { return Err(EncoderStatus::LimitReached); } @@ -1514,12 +1523,12 @@ impl ContextInner { if self.config.temporal_rdo() { // Compute the block importances for the current output frame. - self.compute_block_importances(); + self.compute_block_importances::(); } let cur_output_frameno = self.output_frameno; - let mut ret = self.encode_packet(cur_output_frameno); + let mut ret = self.encode_packet::(cur_output_frameno); if let Ok(ref mut pkt) = ret { self.garbage_collect(pkt.input_frameno); diff --git a/src/api/lookahead.rs b/src/api/lookahead.rs index 2758d5920b..81a495d7d7 100644 --- a/src/api/lookahead.rs +++ b/src/api/lookahead.rs @@ -27,8 +27,8 @@ pub(crate) const IMP_BLOCK_AREA_IN_MV_UNITS: i64 = IMP_BLOCK_SIZE_IN_MV_UNITS * IMP_BLOCK_SIZE_IN_MV_UNITS; #[hawktracer(estimate_intra_costs)] -pub(crate) fn estimate_intra_costs( - temp_plane: &mut Plane, frame: &Frame, bit_depth: usize, +pub(crate) fn estimate_intra_costs( + temp_plane: &mut Plane, frame: &Frame, cpu_feature_level: CpuFeatureLevel, ) -> Box<[u32]> { let plane = &frame.planes[0]; @@ -54,7 +54,7 @@ pub(crate) fn estimate_intra_costs( }); // TODO: other intra prediction modes. - let edge_buf = get_intra_edges( + let edge_buf = get_intra_edges::<_, BD>( &plane.as_region(), TileBlockOffset(BlockOffset { x, y }), 0, @@ -65,7 +65,6 @@ pub(crate) fn estimate_intra_costs( y: (y * IMPORTANCE_BLOCK_SIZE) as isize, }, TxSize::TX_8X8, - bit_depth, Some(PredictionMode::DC_PRED), false, IntraParam::None, @@ -79,7 +78,7 @@ pub(crate) fn estimate_intra_costs( height: IMPORTANCE_BLOCK_SIZE, }); - PredictionMode::DC_PRED.predict_intra( + PredictionMode::DC_PRED.predict_intra::<_, BD>( TileRect { x: x * IMPORTANCE_BLOCK_SIZE, y: y * IMPORTANCE_BLOCK_SIZE, @@ -88,7 +87,6 @@ pub(crate) fn estimate_intra_costs( }, &mut plane_after_prediction_region, tx_size, - bit_depth, &[], // Not used by DC_PRED IntraParam::None, None, // Not used by DC_PRED @@ -104,12 +102,11 @@ pub(crate) fn estimate_intra_costs( height: IMPORTANCE_BLOCK_SIZE, }); - let intra_cost = get_satd( + let intra_cost = get_satd::<_, BD>( &plane_org, &plane_after_prediction_region, bsize.width(), bsize.height(), - bit_depth, cpu_feature_level, ); @@ -177,9 +174,9 @@ pub(crate) fn estimate_importance_block_difference( } #[hawktracer(estimate_inter_costs)] -pub(crate) fn estimate_inter_costs( - frame: Arc>, ref_frame: Arc>, bit_depth: usize, - mut config: EncoderConfig, sequence: Arc, buffer: RefMEStats, +pub(crate) fn estimate_inter_costs( + frame: Arc>, ref_frame: Arc>, mut config: EncoderConfig, + sequence: Arc, buffer: RefMEStats, ) -> f64 { config.low_latency = true; config.speed_settings.multiref = false; @@ -215,7 +212,7 @@ pub(crate) fn estimate_inter_costs( ], }), ); - compute_motion_vectors(&mut fi, &mut fs, &inter_cfg); + compute_motion_vectors::<_, BD>(&mut fi, &mut fs, &inter_cfg); // Estimate inter costs let plane_org = &frame.planes[0]; @@ -252,12 +249,11 @@ pub(crate) fn estimate_inter_costs( height: IMPORTANCE_BLOCK_SIZE, }); - inter_costs += get_satd( + inter_costs += get_satd::<_, BD>( ®ion_org, ®ion_ref, bsize.width(), bsize.height(), - bit_depth, fi.cpu_feature_level, ) as u64; }); @@ -266,7 +262,7 @@ pub(crate) fn estimate_inter_costs( } #[hawktracer(compute_motion_vectors)] -pub(crate) fn compute_motion_vectors( +pub(crate) fn compute_motion_vectors( fi: &mut FrameInvariants, fs: &mut FrameState, inter_cfg: &InterConfig, ) { let mut blocks = FrameBlocks::new(fi.w_in_b, fi.h_in_b); @@ -277,6 +273,6 @@ pub(crate) fn compute_motion_vectors( .into_par_iter() .for_each(|mut ctx| { let ts = &mut ctx.ts; - estimate_tile_motion(fi, ts, inter_cfg); + estimate_tile_motion::<_, BD>(fi, ts, inter_cfg); }); } diff --git a/src/api/test.rs b/src/api/test.rs index 0a698ba4d5..12618388fc 100644 --- a/src/api/test.rs +++ b/src/api/test.rs @@ -2274,7 +2274,7 @@ fn min_quantizer_bounds_correctly() { ctx.flush(); for i in 0..limit { - ctx.inner.encode_packet(i).unwrap(); + ctx.inner.encode_packet::<8>(i).unwrap(); let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap(); if i == 0 { assert_eq!(68, frame_data.fi.base_q_idx); @@ -2305,7 +2305,7 @@ fn min_quantizer_bounds_correctly() { ctx.flush(); for i in 0..limit { - ctx.inner.encode_packet(i).unwrap(); + ctx.inner.encode_packet::<8>(i).unwrap(); let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap(); if i == 0 { assert!(frame_data.fi.base_q_idx > 68); @@ -2339,7 +2339,7 @@ fn max_quantizer_bounds_correctly() { ctx.flush(); for i in 0..limit { - ctx.inner.encode_packet(i).unwrap(); + ctx.inner.encode_packet::<8>(i).unwrap(); let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap(); if i == 0 { assert_eq!(95, frame_data.fi.base_q_idx); @@ -2370,7 +2370,7 @@ fn max_quantizer_bounds_correctly() { ctx.flush(); for i in 0..limit { - ctx.inner.encode_packet(i).unwrap(); + ctx.inner.encode_packet::<8>(i).unwrap(); let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap(); if i == 0 { assert!(frame_data.fi.base_q_idx < 95); diff --git a/src/asm/aarch64/cdef.rs b/src/asm/aarch64/cdef.rs index 2fe70e1248..1c04e50f51 100644 --- a/src/asm/aarch64/cdef.rs +++ b/src/asm/aarch64/cdef.rs @@ -67,8 +67,8 @@ const fn decimate_index(xdec: usize, ydec: usize) -> usize { pub(crate) unsafe fn cdef_filter_block( dst: &mut PlaneRegionMut<'_, T>, src: *const T, src_stride: isize, - pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, - bit_depth: usize, xdec: usize, ydec: usize, edges: u8, cpu: CpuFeatureLevel, + pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, xdec: usize, + ydec: usize, edges: u8, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut| { rust::cdef_filter_block( @@ -79,7 +79,6 @@ pub(crate) unsafe fn cdef_filter_block( sec_strength, dir, damping, - bit_depth, xdec, ydec, edges, diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs index 3ef711aca9..dabf3256c3 100644 --- a/src/asm/shared/predict.rs +++ b/src/asm/shared/predict.rs @@ -21,7 +21,6 @@ mod test { #[test] fn pred_matches_u8() { let tx_size = TxSize::TX_4X4; - let bit_depth = 8; let cpu = CpuFeatureLevel::default(); let ac = [0i16; 32 * 32]; // SAFETY: We write to the array below before reading from it. @@ -73,12 +72,11 @@ mod test { for angle in angles { let expected = { let mut plane = Plane::from_slice(&[0u8; 4 * 4], 4); - rust::dispatch_predict_intra( + rust::dispatch_predict_intra::<_, 8>( *mode, *variant, &mut plane.as_region_mut(), tx_size, - bit_depth, &ac, *angle, None, @@ -93,12 +91,11 @@ mod test { }; let mut output = Plane::from_slice(&[0u8; 4 * 4], 4); - dispatch_predict_intra( + dispatch_predict_intra::<_, 8>( *mode, *variant, &mut output.as_region_mut(), tx_size, - bit_depth, &ac, *angle, None, diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs index d34286bec7..94cb328702 100644 --- a/src/asm/shared/transform/inverse.rs +++ b/src/asm/shared/transform/inverse.rs @@ -17,11 +17,11 @@ pub type InvTxfmFunc = pub type InvTxfmHBDFunc = unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32); -pub fn call_inverse_func( +pub fn call_inverse_func( func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, - eob: usize, width: usize, height: usize, bd: usize, + eob: usize, width: usize, height: usize, ) { - debug_assert!(bd == 8); + debug_assert!(BD == 8); // Only use at most 32 columns and 32 rows of input coefficients. let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)]; @@ -51,7 +51,6 @@ pub fn call_inverse_func( pub fn call_inverse_hbd_func( func: InvTxfmHBDFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize, - _bd: usize, ) { // Only use at most 32 columns and 32 rows of input coefficients. let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)]; @@ -161,35 +160,32 @@ pub mod test { *d = random::(); *r = i16::from(*s) - i16::from(*d); } - forward_transform( + forward_transform::<_, 8>( res, freq, tx_size.width(), tx_size, tx_type, - 8, CpuFeatureLevel::RUST, ); let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h); let mut rust_dst = dst.clone(); - inverse_transform_add( + inverse_transform_add::<_, 8>( freq, &mut dst.as_region_mut(), eob, tx_size, tx_type, - 8, cpu, ); - inverse_transform_add( + inverse_transform_add::<_, 8>( freq, &mut rust_dst.as_region_mut(), eob, tx_size, tx_type, - 8, CpuFeatureLevel::RUST, ); assert_eq!(rust_dst.data_origin(), dst.data_origin()); diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs index 8892429052..50e8801a78 100644 --- a/src/asm/x86/cdef.rs +++ b/src/asm/x86/cdef.rs @@ -41,13 +41,13 @@ const fn decimate_index(xdec: usize, ydec: usize) -> usize { ((ydec << 1) | xdec) & 3 } -pub(crate) unsafe fn cdef_filter_block( +pub(crate) unsafe fn cdef_filter_block( dst: &mut PlaneRegionMut<'_, T>, src: *const T, src_stride: isize, - pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, - bit_depth: usize, xdec: usize, ydec: usize, edges: u8, cpu: CpuFeatureLevel, + pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, xdec: usize, + ydec: usize, edges: u8, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut| { - rust::cdef_filter_block( + rust::cdef_filter_block::<_, _, BD>( dst, src, src_stride, @@ -55,7 +55,6 @@ pub(crate) unsafe fn cdef_filter_block( sec_strength, dir, damping, - bit_depth, xdec, ydec, edges, @@ -124,7 +123,7 @@ pub(crate) unsafe fn cdef_filter_block( sec_strength, dir as i32, damping, - (1 << bit_depth) - 1, + (1 << BD) - 1, ); } None => call_rust(dst), @@ -316,7 +315,6 @@ mod test { let pri_strength = 1; let sec_strength = 0; let damping = 2; - let bit_depth = 8; // SAFETY: Calling functions with raw pointers--we created the // planes above and only read from the start. @@ -324,8 +322,8 @@ mod test { // FIXME: Remove `allow` once https://github.com/rust-lang/rust-clippy/issues/8264 fixed #[allow(clippy::undocumented_unsafe_blocks)] unsafe { - cdef_filter_block(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap()); - cdef_filter_block(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST); + cdef_filter_block::<_, 8>(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + cdef_filter_block::<_, 8>(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST); assert_eq!(rust_dst.data_origin(), dst.data_origin()); } } diff --git a/src/asm/x86/dist/cdef_dist.rs b/src/asm/x86/dist/cdef_dist.rs index 6b590d3730..e8119d9d88 100644 --- a/src/asm/x86/dist/cdef_dist.rs +++ b/src/asm/x86/dist/cdef_dist.rs @@ -53,9 +53,9 @@ extern { /// /// - If in `check_asm` mode, panics on mismatch between native and ASM results. #[allow(clippy::let_and_return)] -pub fn cdef_dist_kernel( +pub fn cdef_dist_kernel( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, - bit_depth: usize, cpu: CpuFeatureLevel, + cpu: CpuFeatureLevel, ) -> u32 { debug_assert!(src.plane_cfg.xdec == 0); debug_assert!(src.plane_cfg.ydec == 0); @@ -67,7 +67,7 @@ pub fn cdef_dist_kernel( debug_assert!(h <= 8); let call_rust = - || -> u32 { rust::cdef_dist_kernel(dst, src, w, h, bit_depth, cpu) }; + || -> u32 { rust::cdef_dist_kernel::<_, BD>(dst, src, w, h, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); @@ -112,7 +112,7 @@ pub fn cdef_dist_kernel( } }; - let dist = apply_ssim_boost(sse, svar, dvar, bit_depth); + let dist = apply_ssim_boost::(sse, svar, dvar); #[cfg(feature = "check_asm")] assert_eq!( dist, ref_dist, @@ -315,41 +315,41 @@ pub mod test { #[test] fn cdef_dist_simd_random() { - cdef_diff_tester(8, random_planes::); + cdef_diff_tester::<_, 8>(random_planes::); } #[test] fn cdef_dist_simd_random_hbd() { - cdef_diff_tester(10, random_planes::); - cdef_diff_tester(12, random_planes::); + cdef_diff_tester::<_, 10>(random_planes::); + cdef_diff_tester::<_, 12>(random_planes::); } #[test] fn cdef_dist_simd_large() { - cdef_diff_tester(8, max_planes::); + cdef_diff_tester::<_, 8>(max_planes::); } #[test] fn cdef_dist_simd_large_hbd() { - cdef_diff_tester(10, max_planes::); - cdef_diff_tester(12, max_planes::); + cdef_diff_tester::<_, 10>(max_planes::); + cdef_diff_tester::<_, 12>(max_planes::); } #[test] fn cdef_dist_simd_large_diff() { - cdef_diff_tester(8, max_diff_planes::); + cdef_diff_tester::<_, 8>(max_diff_planes::); } #[test] fn cdef_dist_simd_large_diff_hbd() { - cdef_diff_tester(10, max_diff_planes::); - cdef_diff_tester(12, max_diff_planes::); + cdef_diff_tester::<_, 10>(max_diff_planes::); + cdef_diff_tester::<_, 12>(max_diff_planes::); } - fn cdef_diff_tester( - bd: usize, gen_planes: fn(bd: usize) -> (Plane, Plane), + fn cdef_diff_tester( + gen_planes: fn(bd: usize) -> (Plane, Plane), ) { - let (src_plane, dst_plane) = gen_planes(bd); + let (src_plane, dst_plane) = gen_planes(BD); let mut fail = false; @@ -361,21 +361,19 @@ pub mod test { let src_region = src_plane.region(area); let dst_region = dst_plane.region(area); - let rust = rust::cdef_dist_kernel( + let rust = rust::cdef_dist_kernel::<_, BD>( &src_region, &dst_region, w, h, - bd, CpuFeatureLevel::default(), ); - let simd = cdef_dist_kernel( + let simd = cdef_dist_kernel::<_, BD>( &src_region, &dst_region, w, h, - bd, CpuFeatureLevel::default(), ); diff --git a/src/asm/x86/dist/mod.rs b/src/asm/x86/dist/mod.rs index 676787adf3..08f3381ab0 100644 --- a/src/asm/x86/dist/mod.rs +++ b/src/asm/x86/dist/mod.rs @@ -286,11 +286,11 @@ pub(crate) const fn to_index(bsize: BlockSize) -> usize { #[allow(clippy::let_and_return)] pub fn get_sad( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, - bit_depth: usize, cpu: CpuFeatureLevel, + cpu: CpuFeatureLevel, ) -> u32 { let bsize_opt = BlockSize::from_width_and_height_opt(w, h); - let call_rust = || -> u32 { rust::get_sad(dst, src, w, h, bit_depth, cpu) }; + let call_rust = || -> u32 { rust::get_sad(dst, src, w, h, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); @@ -338,13 +338,13 @@ pub fn get_sad( /// - If in `check_asm` mode, panics on mismatch between native and ASM results. #[inline(always)] #[allow(clippy::let_and_return)] -pub fn get_satd( +pub fn get_satd( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, - bit_depth: usize, cpu: CpuFeatureLevel, + cpu: CpuFeatureLevel, ) -> u32 { let bsize_opt = BlockSize::from_width_and_height_opt(w, h); - let call_rust = || -> u32 { rust::get_satd(dst, src, w, h, bit_depth, cpu) }; + let call_rust = || -> u32 { rust::get_satd(dst, src, w, h, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); @@ -374,7 +374,7 @@ pub fn get_satd( T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), - (1 << bit_depth) - 1, + (1 << BD) - 1, ) }, None => call_rust(), @@ -735,12 +735,12 @@ mod test { use rand::random; use std::str::FromStr; - macro_rules! test_dist_fns { - ($(($W:expr, $H:expr)),*, $DIST_TY:ident, $BD:expr, $OPT:ident, $OPTLIT:tt) => { + macro_rules! test_dist_sad_fns { + ($(($W:expr, $H:expr)),*, $OPT:ident, $OPTLIT:tt, $BD:expr) => { $( paste::item! { #[test] - fn []() { + fn []() { if !is_x86_feature_detected!($OPTLIT) { eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); return; @@ -755,8 +755,8 @@ mod test { *s = random::() as u16 * $BD / 8; *d = random::() as u16 * $BD / 8; } - let result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap()); - let rust_result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST); + let result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + let rust_result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST); assert_eq!(rust_result, result); } else { @@ -768,8 +768,8 @@ mod test { *s = random::(); *d = random::(); } - let result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap()); - let rust_result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST); + let result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + let rust_result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST); assert_eq!(rust_result, result); } @@ -779,7 +779,51 @@ mod test { } } - test_dist_fns!( + macro_rules! test_dist_satd_fns { + ($(($W:expr, $H:expr)),*, $OPT:ident, $OPTLIT:tt, $BD:expr) => { + $( + paste::item! { + #[test] + fn []() { + if !is_x86_feature_detected!($OPTLIT) { + eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); + return; + } + + if $BD > 8 { + // dynamic allocation: test + let mut src = Plane::from_slice(&[0u16; $W * $H], $W); + // dynamic allocation: test + let mut dst = Plane::from_slice(&[0u16; $W * $H], $W); + for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) { + *s = random::() as u16 * $BD / 8; + *d = random::() as u16 * $BD / 8; + } + let result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + let rust_result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST); + + assert_eq!(rust_result, result); + } else { + // dynamic allocation: test + let mut src = Plane::from_slice(&[0u8; $W * $H], $W); + // dynamic allocation: test + let mut dst = Plane::from_slice(&[0u8; $W * $H], $W); + for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) { + *s = random::(); + *d = random::(); + } + let result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + let rust_result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST); + + assert_eq!(rust_result, result); + } + } + } + )* + } + } + + test_dist_sad_fns!( (4, 4), (16, 16), (8, 8), @@ -802,13 +846,12 @@ mod test { (128, 64), (16, 64), (64, 16), - sad, - 10, ssse3, - "ssse3" + "ssse3", + 10 ); - test_dist_fns!( + test_dist_sad_fns!( (4, 4), (16, 16), (8, 8), @@ -831,13 +874,12 @@ mod test { (128, 64), (16, 64), (64, 16), - sad, - 10, avx2, - "avx2" + "avx2", + 10 ); - test_dist_fns!( + test_dist_sad_fns!( (4, 4), (4, 8), (4, 16), @@ -849,13 +891,12 @@ mod test { (32, 32), (64, 64), (128, 128), - sad, - 8, sse2, - "sse2" + "sse2", + 8 ); - test_dist_fns!( + test_dist_sad_fns!( (16, 4), (16, 8), (16, 16), @@ -871,17 +912,16 @@ mod test { (64, 128), (128, 64), (128, 128), - sad, - 8, avx2, - "avx2" + "avx2", + 8 ); - test_dist_fns!((8, 8), satd, 8, ssse3, "ssse3"); + test_dist_satd_fns!((8, 8), ssse3, "ssse3", 8); - test_dist_fns!((4, 4), satd, 8, sse4, "sse4.1"); + test_dist_satd_fns!((4, 4), sse4, "sse4.1", 8); - test_dist_fns!( + test_dist_satd_fns!( (4, 4), (8, 8), (16, 16), @@ -904,13 +944,12 @@ mod test { (32, 8), (16, 64), (64, 16), - satd, - 8, avx2, - "avx2" + "avx2", + 8 ); - test_dist_fns!( + test_dist_satd_fns!( (4, 4), (8, 8), (16, 16), @@ -933,13 +972,12 @@ mod test { (32, 8), (16, 64), (64, 16), - satd, - 10, avx2, - "avx2" + "avx2", + 10 ); - test_dist_fns!( + test_dist_satd_fns!( (4, 4), (8, 8), (16, 16), @@ -962,9 +1000,8 @@ mod test { (32, 8), (16, 64), (64, 16), - satd, - 12, avx2, - "avx2" + "avx2", + 12 ); } diff --git a/src/asm/x86/dist/sse.rs b/src/asm/x86/dist/sse.rs index 08c710da11..dd4e6ef7de 100644 --- a/src/asm/x86/dist/sse.rs +++ b/src/asm/x86/dist/sse.rs @@ -92,8 +92,7 @@ declare_asm_hbd_sse_fn![ #[allow(clippy::let_and_return)] pub fn get_weighted_sse( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, scale: &[u32], - scale_stride: usize, w: usize, h: usize, bit_depth: usize, - cpu: CpuFeatureLevel, + scale_stride: usize, w: usize, h: usize, cpu: CpuFeatureLevel, ) -> u64 { // Assembly breaks if imp block size changes. assert_eq!(IMPORTANCE_BLOCK_SIZE >> 1, 4); @@ -101,7 +100,7 @@ pub fn get_weighted_sse( let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u64 { - rust::get_weighted_sse(dst, src, scale, scale_stride, w, h, bit_depth, cpu) + rust::get_weighted_sse(dst, src, scale, scale_stride, w, h, cpu) }; #[cfg(feature = "check_asm")] @@ -381,7 +380,6 @@ pub mod test { SCALE_STRIDE, block.width(), block.height(), - bd, CpuFeatureLevel::default(), ); @@ -392,7 +390,6 @@ pub mod test { SCALE_STRIDE, block.width(), block.height(), - bd, CpuFeatureLevel::default(), ); diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs index 2f5e0b3b8f..19af81e54b 100644 --- a/src/asm/x86/mc.rs +++ b/src/asm/x86/mc.rs @@ -91,15 +91,14 @@ const fn get_2d_mode_idx(mode_x: FilterMode, mode_y: FilterMode) -> usize { /// - If `width * height` is greater than the length of `tmp1` or `tmp2` /// - If `width` and `height` do not fit within the bounds of `src` #[inline(always)] -pub fn put_8tap( +pub fn put_8tap( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, - mode_y: FilterMode, bit_depth: usize, cpu: CpuFeatureLevel, + mode_y: FilterMode, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { - rust::put_8tap( - dst, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth, - cpu, + rust::put_8tap::<_, BD>( + dst, src, width, height, col_frac, row_frac, mode_x, mode_y, cpu, ); }; #[cfg(feature = "check_asm")] @@ -149,7 +148,7 @@ pub fn put_8tap( height as i32, col_frac, row_frac, - (1 << bit_depth) - 1, + (1 << BD) - 1, ), None => call_rust(dst), } @@ -176,15 +175,14 @@ pub fn put_8tap( /// - If `width * height` is greater than the length of `tmp1` or `tmp2` /// - If `width` and `height` do not fit within the bounds of `src` #[inline(always)] -pub fn prep_8tap( +pub fn prep_8tap( tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode, - bit_depth: usize, cpu: CpuFeatureLevel, + cpu: CpuFeatureLevel, ) { let call_rust = |tmp: &mut [i16]| { - rust::prep_8tap( - tmp, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth, - cpu, + rust::prep_8tap::<_, BD>( + tmp, src, width, height, col_frac, row_frac, mode_x, mode_y, cpu, ); }; #[cfg(feature = "check_asm")] @@ -223,7 +221,7 @@ pub fn prep_8tap( None => call_rust(tmp), } } - PixelType::U16 if bit_depth > 8 => { + PixelType::U16 if BD > 8 => { match PREP_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( tmp.as_mut_ptr() as *mut _, @@ -233,7 +231,7 @@ pub fn prep_8tap( height as i32, col_frac, row_frac, - (1 << bit_depth) - 1, + (1 << BD) - 1, ), None => call_rust(tmp), } @@ -253,12 +251,12 @@ pub fn prep_8tap( /// - If `width` is not between 2 and 128 /// - If `width * height` is greater than the length of `tmp1` or `tmp2` /// - If `width` and `height` do not fit within the bounds of `dst` -pub fn mc_avg( +pub fn mc_avg( dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize, - height: usize, bit_depth: usize, cpu: CpuFeatureLevel, + height: usize, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { - rust::mc_avg(dst, tmp1, tmp2, width, height, bit_depth, cpu); + rust::mc_avg::<_, BD>(dst, tmp1, tmp2, width, height, cpu); }; #[cfg(feature = "check_asm")] let ref_dst = { @@ -292,7 +290,7 @@ pub fn mc_avg( ), None => call_rust(dst), }, - PixelType::U16 if bit_depth > 8 => match AVG_HBD_FNS[cpu.as_index()] { + PixelType::U16 if BD > 8 => match AVG_HBD_FNS[cpu.as_index()] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), @@ -300,7 +298,7 @@ pub fn mc_avg( tmp2.as_ptr(), width as i32, height as i32, - (1 << bit_depth) - 1, + (1 << BD) - 1, ), None => call_rust(dst), }, @@ -652,8 +650,8 @@ mod test { for mv in &test_mvs { let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv); - super::put_8tap(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap()); - super::put_8tap(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST); + super::put_8tap::<_, 8>(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + super::put_8tap::<_, 8>(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST); assert_eq!(&*dst1.data, &*dst2.data); } @@ -670,8 +668,8 @@ mod test { for mv in &test_mvs { let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv); - super::put_8tap(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap()); - super::put_8tap(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST); + super::put_8tap::<_, 8>(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + super::put_8tap::<_, 8>(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST); assert_eq!(&*dst1.data, &*dst2.data); } @@ -740,8 +738,8 @@ mod test { for mv in &test_mvs { let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv); - super::prep_8tap(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap()); - super::prep_8tap(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST); + super::prep_8tap::<_, 8>(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + super::prep_8tap::<_, 8>(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST); } } else { // dynamic allocation: test @@ -752,8 +750,8 @@ mod test { for mv in &test_mvs { let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv); - super::prep_8tap(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap()); - super::prep_8tap(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST); + super::prep_8tap::<_, 8>(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap()); + super::prep_8tap::<_, 8>(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST); } }; diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs index e001ae98ef..67cec03afe 100644 --- a/src/asm/x86/predict.rs +++ b/src/asm/x86/predict.rs @@ -194,16 +194,15 @@ decl_cfl_pred_hbd_fn! { } #[inline(always)] -pub fn dispatch_predict_intra( +pub fn dispatch_predict_intra( mode: PredictionMode, variant: PredictionVariant, - dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, - ac: &[i16], angle: isize, ief_params: Option, + dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, ac: &[i16], angle: isize, + ief_params: Option, edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { - rust::dispatch_predict_intra( - mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf, - cpu, + rust::dispatch_predict_intra::( + mode, variant, dst, tx_size, ac, angle, ief_params, edge_buf, cpu, ); }; @@ -362,11 +361,11 @@ pub fn dispatch_predict_intra( } } } - PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 && bit_depth > 8 => { + PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 && BD > 8 => { let dst_ptr = dst.data_ptr_mut() as *mut _; let edge_ptr = edge_buf.data.as_ptr().offset(2 * MAX_TX_SIZE as isize) as *const _; - let bd_max = (1 << bit_depth) - 1; + let bd_max = (1 << BD) - 1; match mode { PredictionMode::DC_PRED => { (match variant { diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs index 28dbabedc7..3228902391 100644 --- a/src/asm/x86/quantize.rs +++ b/src/asm/x86/quantize.rs @@ -24,7 +24,6 @@ type DequantizeFn = unsafe fn( _eob: usize, rcoeffs_ptr: *mut i16, tx_size: TxSize, - bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, ); @@ -32,18 +31,17 @@ type DequantizeFn = unsafe fn( cpu_function_lookup_table!( DEQUANTIZE_FNS: [Option], default: None, - [(AVX2, Some(dequantize_avx2))] + [(AVX2, Some(dequantize_avx2_8bpc))] ); #[inline(always)] -pub fn dequantize( +pub fn dequantize( qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize, - bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel, + dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel, ) { let call_rust = |rcoeffs: &mut [T]| { - crate::quantize::rust::dequantize( - qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q, - ac_delta_q, cpu, + crate::quantize::rust::dequantize::<_, BD>( + qindex, coeffs, eob, rcoeffs, tx_size, dc_delta_q, ac_delta_q, cpu, ); }; @@ -67,7 +65,6 @@ pub fn dequantize( eob, rcoeffs.as_mut_ptr() as *mut _, tx_size, - bit_depth, dc_delta_q, ac_delta_q, ) @@ -87,18 +84,19 @@ pub fn dequantize( } #[target_feature(enable = "avx2")] -unsafe fn dequantize_avx2( +unsafe fn dequantize_avx2_8bpc( qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16, - tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, + tx_size: TxSize, dc_delta_q: i8, ac_delta_q: i8, ) { + const BD: usize = 8; let log_tx_scale = _mm256_set1_epi32(get_log_tx_scale(tx_size) as i32); let quants_ac = - _mm256_set1_epi32(ac_q(qindex, ac_delta_q, bit_depth).get() as i32); + _mm256_set1_epi32(ac_q::(qindex, ac_delta_q).get() as i32); // Use the dc quantize as first vector element for the first iteration let mut quants = _mm256_insert_epi32( quants_ac, - dc_q(qindex, dc_delta_q, bit_depth).get() as i32, + dc_q::(qindex, dc_delta_q).get() as i32, 0, ); @@ -169,12 +167,10 @@ mod test { TX_8X32, TX_32X8, TX_16X64, TX_64X16, ]; - let bd: usize = 8; - for &tx_size in &tx_sizes { let qindex: u8 = rng.gen_range((MINQ as u8)..(MAXQ as u8)); - let dc_quant = dc_q(qindex, 0, bd).get() as i16; - let ac_quant = ac_q(qindex, 0, bd).get() as i16; + let dc_quant = dc_q::<8>(qindex, 0).get() as i16; + let ac_quant = ac_q::<8>(qindex, 0).get() as i16; // Test the min, max, and random eobs let eobs = { @@ -200,13 +196,12 @@ mod test { } // Rely on quantize's internal tests - dequantize( + dequantize::<_, 8>( qindex, &qcoeffs.data, eob, &mut rcoeffs.data, tx_size, - bd, 0, 0, CpuFeatureLevel::default(), diff --git a/src/asm/x86/transform/forward.rs b/src/asm/x86/transform/forward.rs index 18b1171517..d341915a0d 100644 --- a/src/asm/x86/transform/forward.rs +++ b/src/asm/x86/transform/forward.rs @@ -332,9 +332,9 @@ fn cast_mut(x: &mut [T]) -> &mut [T; N] { #[allow(clippy::identity_op, clippy::erasing_op)] #[target_feature(enable = "avx2")] -unsafe fn forward_transform_avx2( +unsafe fn forward_transform_avx2( input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize, - tx_type: TxType, bd: usize, + tx_type: TxType, ) { // Note when assigning txfm_size_col, we use the txfm_size from the // row configuration and vice versa. This is intentionally done to @@ -350,7 +350,7 @@ unsafe fn forward_transform_avx2( let mut tmp: Aligned<[I32X8; 64 * 64 / 8]> = Aligned::uninitialized(); let buf = &mut tmp.data[..txfm_size_col * (txfm_size_row / 8).max(1)]; - let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd); + let cfg = Txfm2DFlipCfg::fwd::(tx_type, tx_size); let txfm_func_col = get_func_i32x8(cfg.txfm_type_col); let txfm_func_row = get_func_i32x8(cfg.txfm_type_row); @@ -507,18 +507,20 @@ unsafe fn forward_transform_avx2( /// # Panics /// /// - If called with an invalid combination of `tx_size` and `tx_type` -pub fn forward_transform( +pub fn forward_transform( input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize, - tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, + tx_type: TxType, cpu: CpuFeatureLevel, ) { assert!(valid_av1_transform(tx_size, tx_type)); if cpu >= CpuFeatureLevel::AVX2 { // SAFETY: Calls Assembly code. unsafe { - forward_transform_avx2(input, output, stride, tx_size, tx_type, bd); + forward_transform_avx2::<_, BD>(input, output, stride, tx_size, tx_type); } } else { - rust::forward_transform(input, output, stride, tx_size, tx_type, bd, cpu); + rust::forward_transform::<_, BD>( + input, output, stride, tx_size, tx_type, cpu, + ); } } @@ -562,22 +564,20 @@ mod test { let mut output_simd = vec![0i16; area]; println!("Testing combination {:?}, {:?}", tx_size, tx_type); - forward_transform( + forward_transform::<_, 8>( &input[..], &mut output_ref[..], tx_size.width(), tx_size, tx_type, - 8, CpuFeatureLevel::RUST, ); - forward_transform( + forward_transform::<_, 8>( &input[..], &mut output_simd[..], tx_size.width(), tx_size, tx_type, - 8, cpu, ); assert_eq!(output_ref, output_simd) diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs index 027cdf19b7..84b728d5e0 100644 --- a/src/asm/x86/transform/inverse.rs +++ b/src/asm/x86/transform/inverse.rs @@ -16,27 +16,26 @@ use crate::{Pixel, PixelType}; use crate::asm::shared::transform::inverse::*; use crate::asm::shared::transform::*; -pub fn inverse_transform_add( +pub fn inverse_transform_add( input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize, - tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, + tx_size: TxSize, tx_type: TxType, cpu: CpuFeatureLevel, ) { match T::type_enum() { PixelType::U8 => { if let Some(func) = INV_TXFM_FNS[cpu.as_index()] [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)] { - return call_inverse_func( + return call_inverse_func::<_, BD>( func, input, output, eob, tx_size.width(), tx_size.height(), - bd, ); } } - PixelType::U16 if bd == 10 => { + PixelType::U16 if BD == 10 => { if let Some(func) = INV_TXFM_HBD_FNS_10[cpu.as_index()] [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)] { @@ -47,11 +46,10 @@ pub fn inverse_transform_add( eob, tx_size.width(), tx_size.height(), - bd, ); } } - PixelType::U16 => { + PixelType::U16 if BD == 12 => { if let Some(func) = INV_TXFM_HBD_FNS_12[cpu.as_index()] [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)] { @@ -62,13 +60,15 @@ pub fn inverse_transform_add( eob, tx_size.width(), tx_size.height(), - bd, ); } } + _ => unimplemented!(), }; - rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu); + rust::inverse_transform_add::<_, BD>( + input, output, eob, tx_size, tx_type, cpu, + ); } macro_rules! decl_itx_fns { diff --git a/src/cdef.rs b/src/cdef.rs index 863399036f..71fc1ae2d1 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -196,11 +196,14 @@ pub(crate) mod rust { #[cold_for_target_arch("x86_64")] #[allow(clippy::erasing_op, clippy::identity_op, clippy::neg_multiply)] - pub(crate) unsafe fn cdef_filter_block( + pub(crate) unsafe fn cdef_filter_block< + T: Pixel, + U: Pixel, + const BD: usize, + >( dst: &mut PlaneRegionMut<'_, T>, input: *const U, istride: isize, pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, - bit_depth: usize, xdec: usize, ydec: usize, edges: u8, - _cpu: CpuFeatureLevel, + xdec: usize, ydec: usize, edges: u8, _cpu: CpuFeatureLevel, ) { if edges != CDEF_HAVE_ALL { // slowpath for unpadded border[s] @@ -216,7 +219,7 @@ pub(crate) mod rust { 8 >> ydec, edges, ); - cdef_filter_block( + cdef_filter_block::<_, _, BD>( dst, tmp.as_ptr().offset(2 * tmpstride + 2), tmpstride, @@ -224,7 +227,6 @@ pub(crate) mod rust { sec_strength, dir, damping, - bit_depth, xdec, ydec, CDEF_HAVE_ALL, @@ -233,7 +235,7 @@ pub(crate) mod rust { } else { let xsize = (8 >> xdec) as isize; let ysize = (8 >> ydec) as isize; - let coeff_shift = bit_depth - 8; + let coeff_shift = BD - 8; let cdef_pri_taps = [[4, 2], [3, 3]]; let cdef_sec_taps = [[2, 1], [2, 1]]; let pri_taps = @@ -322,7 +324,7 @@ fn adjust_strength(strength: i32, var: i32) -> i32 { } } -pub fn cdef_analyze_superblock_range( +pub fn cdef_analyze_superblock_range( fi: &FrameInvariants, in_frame: &Frame, blocks: &TileBlocks<'_>, sb_w: usize, sb_h: usize, ) -> Vec { @@ -330,17 +332,17 @@ pub fn cdef_analyze_superblock_range( for sby in 0..sb_h { for sbx in 0..sb_w { let sbo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }); - ret.push(cdef_analyze_superblock(fi, in_frame, blocks, sbo)); + ret.push(cdef_analyze_superblock::<_, BD>(fi, in_frame, blocks, sbo)); } } ret } -pub fn cdef_analyze_superblock( +pub fn cdef_analyze_superblock( fi: &FrameInvariants, in_frame: &Frame, blocks: &TileBlocks<'_>, sbo: TileSuperBlockOffset, ) -> CdefDirections { - let coeff_shift = fi.sequence.bit_depth - 8; + let coeff_shift = BD - 8; let mut dir: CdefDirections = CdefDirections { dir: [[0; 8]; 8], var: [[0; 8]; 8] }; // Each direction block is 8x8 in y, and direction computation only looks at y @@ -396,13 +398,12 @@ pub fn cdef_analyze_superblock( /// # Panics /// /// - If called with invalid parameters -pub fn cdef_filter_superblock( +pub fn cdef_filter_superblock( fi: &FrameInvariants, input: &Frame, output: &mut TileMut<'_, T>, blocks: &TileBlocks<'_>, tile_sbo: TileSuperBlockOffset, cdef_index: u8, cdef_dirs: &CdefDirections, ) { - let bit_depth = fi.sequence.bit_depth; - let coeff_shift = fi.sequence.bit_depth as i32 - 8; + let coeff_shift = BD as i32 - 8; let cdef_damping = fi.cdef_damping as i32; let cdef_y_strength = fi.cdef_y_strengths[cdef_index as usize]; let cdef_uv_strength = fi.cdef_uv_strengths[cdef_index as usize]; @@ -536,7 +537,7 @@ pub fn cdef_filter_superblock( 0 <= in_po.y - if edges & CDEF_HAVE_TOP > 0 { 2 } else { 0 } ); - cdef_filter_block( + cdef_filter_block::<_, BD>( out_block, in_slice.as_ptr(), in_stride as isize, @@ -544,7 +545,6 @@ pub fn cdef_filter_superblock( local_sec_strength, local_dir, local_damping, - bit_depth, xdec, ydec, edges, @@ -592,7 +592,7 @@ pub fn cdef_filter_superblock( // don't exist. #[hawktracer(cdef_filter_tile)] -pub fn cdef_filter_tile( +pub fn cdef_filter_tile( fi: &FrameInvariants, input: &Frame, tb: &TileBlocks, output: &mut TileMut<'_, T>, ) { @@ -613,9 +613,10 @@ pub fn cdef_filter_tile( // the input Frame. let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: fbx, y: fby }); let cdef_index = tb.get_cdef(tile_sbo); - let cdef_dirs = cdef_analyze_superblock(fi, input, tb, tile_sbo); + let cdef_dirs = + cdef_analyze_superblock::<_, BD>(fi, input, tb, tile_sbo); - cdef_filter_superblock( + cdef_filter_superblock::<_, BD>( fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs, ); } diff --git a/src/deblock.rs b/src/deblock.rs index 21a4bf19aa..43afbfb468 100644 --- a/src/deblock.rs +++ b/src/deblock.rs @@ -146,9 +146,10 @@ fn deblock_level( // four taps, 4 outputs (two are trivial) #[inline] -fn filter_narrow2_4( - p1: i32, p0: i32, q0: i32, q1: i32, shift: usize, +fn filter_narrow2_4( + p1: i32, p0: i32, q0: i32, q1: i32, ) -> [i32; 4] { + let shift = BD - 8; let filter0 = clamp(p1 - q1, -128 << shift, (128 << shift) - 1); let filter1 = clamp(filter0 + 3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3; @@ -178,28 +179,29 @@ fn filter_narrow2_4( // six taps, 6 outputs (four are trivial) #[inline] -fn filter_narrow2_6( - p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize, +fn filter_narrow2_6( + p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, ) -> [i32; 6] { - let x = filter_narrow2_4(p1, p0, q0, q1, shift); + let x = filter_narrow2_4::(p1, p0, q0, q1); [p2, x[0], x[1], x[2], x[3], q2] } // 12 taps, 12 outputs (ten are trivial) #[inline] -fn filter_narrow2_12( +fn filter_narrow2_12( p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, - q2: i32, q3: i32, q4: i32, q5: i32, shift: usize, + q2: i32, q3: i32, q4: i32, q5: i32, ) -> [i32; 12] { - let x = filter_narrow2_4(p1, p0, q0, q1, shift); + let x = filter_narrow2_4::(p1, p0, q0, q1); [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5] } // four taps, 4 outputs #[inline] -fn filter_narrow4_4( - p1: i32, p0: i32, q0: i32, q1: i32, shift: usize, +fn filter_narrow4_4( + p1: i32, p0: i32, q0: i32, q1: i32, ) -> [i32; 4] { + let shift = BD - 8; let filter1 = clamp(3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3; // be certain our optimization removing a clamp is sound @@ -227,20 +229,20 @@ fn filter_narrow4_4( // six taps, 6 outputs (two are trivial) #[inline] -fn filter_narrow4_6( - p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize, +fn filter_narrow4_6( + p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, ) -> [i32; 6] { - let x = filter_narrow4_4(p1, p0, q0, q1, shift); + let x = filter_narrow4_4::(p1, p0, q0, q1); [p2, x[0], x[1], x[2], x[3], q2] } // 12 taps, 12 outputs (eight are trivial) #[inline] -fn filter_narrow4_12( +fn filter_narrow4_12( p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, - q2: i32, q3: i32, q4: i32, q5: i32, shift: usize, + q2: i32, q3: i32, q4: i32, q5: i32, ) -> [i32; 12] { - let x = filter_narrow4_4(p1, p0, q0, q1, shift); + let x = filter_narrow4_4::(p1, p0, q0, q1); [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5] } @@ -333,57 +335,63 @@ fn stride_sse(a: &[i32; LEN], b: &[i32; LEN]) -> i64 { } #[inline] -const fn _level_to_limit(level: i32, shift: usize) -> i32 { +const fn _level_to_limit(level: i32) -> i32 { + let shift = BD - 8; level << shift } #[inline] -const fn limit_to_level(limit: i32, shift: usize) -> i32 { +const fn limit_to_level(limit: i32) -> i32 { + let shift = BD - 8; (limit + (1 << shift) - 1) >> shift } #[inline] -const fn _level_to_blimit(level: i32, shift: usize) -> i32 { +const fn _level_to_blimit(level: i32) -> i32 { + let shift = BD - 8; (3 * level + 4) << shift } #[inline] -const fn blimit_to_level(blimit: i32, shift: usize) -> i32 { +const fn blimit_to_level(blimit: i32) -> i32 { + let shift = BD - 8; (((blimit + (1 << shift) - 1) >> shift) - 2) / 3 } #[inline] -const fn _level_to_thresh(level: i32, shift: usize) -> i32 { +const fn _level_to_thresh(level: i32) -> i32 { + let shift = BD - 8; level >> 4 << shift } #[inline] -const fn thresh_to_level(thresh: i32, shift: usize) -> i32 { +const fn thresh_to_level(thresh: i32) -> i32 { + let shift = BD - 8; (thresh + (1 << shift) - 1) >> shift << 4 } #[inline] -fn nhev4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize { - thresh_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift) as usize +fn nhev4(p1: i32, p0: i32, q0: i32, q1: i32) -> usize { + thresh_to_level::(cmp::max((p1 - p0).abs(), (q1 - q0).abs())) as usize } #[inline] -fn mask4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize { +fn mask4(p1: i32, p0: i32, q0: i32, q1: i32) -> usize { cmp::max( - limit_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift), - blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift), + limit_to_level::(cmp::max((p1 - p0).abs(), (q1 - q0).abs())), + blimit_to_level::((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2), ) as usize } #[inline] -fn deblock_size4_inner( - [p1, p0, q0, q1]: [i32; 4], level: usize, bd: usize, +fn deblock_size4_inner( + [p1, p0, q0, q1]: [i32; 4], level: usize, ) -> Option<[i32; 4]> { - if mask4(p1, p0, q0, q1, bd - 8) <= level { - let x = if nhev4(p1, p0, q0, q1, bd - 8) <= level { - filter_narrow4_4(p1, p0, q0, q1, bd - 8) + if mask4::(p1, p0, q0, q1) <= level { + let x = if nhev4::(p1, p0, q0, q1) <= level { + filter_narrow4_4::(p1, p0, q0, q1) } else { - filter_narrow2_4(p1, p0, q0, q1, bd - 8) + filter_narrow2_4::(p1, p0, q0, q1) }; Some(x) } else { @@ -392,26 +400,26 @@ fn deblock_size4_inner( } // Assumes rec[0] is set 2 taps back from the edge -fn deblock_v_size4( - rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, +fn deblock_v_size4( + rec: &mut PlaneRegionMut<'_, T>, level: usize, ) { for y in 0..4 { let p = &rec[y]; let vals = [p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_()]; - if let Some(data) = deblock_size4_inner(vals, level, bd) { + if let Some(data) = deblock_size4_inner::(vals, level) { copy_horizontal(rec, 0, y, &data); } } } // Assumes rec[0] is set 2 taps back from the edge -fn deblock_h_size4( - rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, +fn deblock_h_size4( + rec: &mut PlaneRegionMut<'_, T>, level: usize, ) { for x in 0..4 { let vals = [rec[0][x].as_(), rec[1][x].as_(), rec[2][x].as_(), rec[3][x].as_()]; - if let Some(data) = deblock_size4_inner(vals, level, bd) { + if let Some(data) = deblock_size4_inner::(vals, level) { copy_vertical(rec, x, 0, &data); } } @@ -419,9 +427,9 @@ fn deblock_h_size4( // Assumes rec[0] and src[0] are set 2 taps back from the edge. // Accesses four taps, accumulates four pixels into the tally -fn sse_size4( +fn sse_size4( rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, - tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, + tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, ) { for i in 0..4 { let (p1, p0, q0, q1, a) = if horizontal_p { @@ -445,13 +453,13 @@ fn sse_size4( // three possibilities: no filter, narrow2 and narrow4 // All possibilities produce four outputs let none: [_; 4] = [p1, p0, q0, q1]; - let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8); - let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8); + let narrow2 = filter_narrow2_4::(p1, p0, q0, q1); + let narrow4 = filter_narrow4_4::(p1, p0, q0, q1); // mask4 sets the dividing line for filter vs no filter // nhev4 sets the dividing line between narrow2 and narrow4 - let mask = clamp(mask4(p1, p0, q0, q1, bd - 8), 1, MAX_LOOP_FILTER + 1); - let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1); + let mask = clamp(mask4::(p1, p0, q0, q1), 1, MAX_LOOP_FILTER + 1); + let nhev = clamp(nhev4::(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1); // sse for each; short-circuit the 'special' no-op cases. let sse_none = stride_sse(&a, &none); @@ -474,18 +482,15 @@ fn sse_size4( } #[inline] -fn mask6( - p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize, +fn mask6( + p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, ) -> usize { cmp::max( - limit_to_level( - cmp::max( - (p2 - p1).abs(), - cmp::max((p1 - p0).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs())), - ), - shift, - ), - blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift), + limit_to_level::(cmp::max( + (p2 - p1).abs(), + cmp::max((p1 - p0).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs())), + )), + blimit_to_level::((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2), ) as usize } @@ -498,17 +503,17 @@ fn flat6(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32) -> usize { } #[inline] -fn deblock_size6_inner( - [p2, p1, p0, q0, q1, q2]: [i32; 6], level: usize, bd: usize, +fn deblock_size6_inner( + [p2, p1, p0, q0, q1, q2]: [i32; 6], level: usize, ) -> Option<[i32; 4]> { - if mask6(p2, p1, p0, q0, q1, q2, bd - 8) <= level { - let flat = 1 << (bd - 8); + if mask6::(p2, p1, p0, q0, q1, q2) <= level { + let flat = 1 << (BD - 8); let x = if flat6(p2, p1, p0, q0, q1, q2) <= flat { filter_wide6_4(p2, p1, p0, q0, q1, q2) - } else if nhev4(p1, p0, q0, q1, bd - 8) <= level { - filter_narrow4_4(p1, p0, q0, q1, bd - 8) + } else if nhev4::(p1, p0, q0, q1) <= level { + filter_narrow4_4::(p1, p0, q0, q1) } else { - filter_narrow2_4(p1, p0, q0, q1, bd - 8) + filter_narrow2_4::(p1, p0, q0, q1) }; Some(x) } else { @@ -517,22 +522,22 @@ fn deblock_size6_inner( } // Assumes slice[0] is set 3 taps back from the edge -fn deblock_v_size6( - rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, +fn deblock_v_size6( + rec: &mut PlaneRegionMut<'_, T>, level: usize, ) { for y in 0..4 { let p = &rec[y]; let vals = [p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_(), p[4].as_(), p[5].as_()]; - if let Some(data) = deblock_size6_inner(vals, level, bd) { + if let Some(data) = deblock_size6_inner::(vals, level) { copy_horizontal(rec, 1, y, &data); } } } // Assumes slice[0] is set 3 taps back from the edge -fn deblock_h_size6( - rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, +fn deblock_h_size6( + rec: &mut PlaneRegionMut<'_, T>, level: usize, ) { for x in 0..4 { let vals = [ @@ -543,7 +548,7 @@ fn deblock_h_size6( rec[4][x].as_(), rec[5][x].as_(), ]; - if let Some(data) = deblock_size6_inner(vals, level, bd) { + if let Some(data) = deblock_size6_inner::(vals, level) { copy_vertical(rec, x, 1, &data); } } @@ -551,11 +556,11 @@ fn deblock_h_size6( // Assumes rec[0] and src[0] are set 3 taps back from the edge. // Accesses six taps, accumulates four pixels into the tally -fn sse_size6( +fn sse_size6( rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, - tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, + tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, ) { - let flat = 1 << (bd - 8); + let flat = 1 << (BD - 8); for i in 0..4 { let (p2, p1, p0, q0, q1, q2, a) = if horizontal_p { // six taps @@ -587,16 +592,16 @@ fn sse_size6( // All possibilities produce four outputs let none: [_; 4] = [p1, p0, q0, q1]; let wide6 = filter_wide6_4(p2, p1, p0, q0, q1, q2); - let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8); - let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8); + let narrow2 = filter_narrow2_4::(p1, p0, q0, q1); + let narrow4 = filter_narrow4_4::(p1, p0, q0, q1); // mask6 sets the dividing line for filter vs no filter // flat6 decides between wide and narrow filters (unrelated to level) // nhev4 sets the dividing line between narrow2 and narrow4 let mask = - clamp(mask6(p2, p1, p0, q0, q1, q2, bd - 8), 1, MAX_LOOP_FILTER + 1); + clamp(mask6::(p2, p1, p0, q0, q1, q2), 1, MAX_LOOP_FILTER + 1); let flatp = flat6(p2, p1, p0, q0, q1, q2) <= flat; - let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1); + let nhev = clamp(nhev4::(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1); // sse for each; short-circuit the 'special' no-op cases. let sse_none = stride_sse(&a, &none); @@ -627,28 +632,24 @@ fn sse_size6( } #[inline] -fn mask8( +fn mask8( p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, - shift: usize, ) -> usize { cmp::max( - limit_to_level( + limit_to_level::(cmp::max( + (p3 - p2).abs(), cmp::max( - (p3 - p2).abs(), + (p2 - p1).abs(), cmp::max( - (p2 - p1).abs(), + (p1 - p0).abs(), cmp::max( - (p1 - p0).abs(), - cmp::max( - (q3 - q2).abs(), - cmp::max((q2 - q1).abs(), (q1 - q0).abs()), - ), + (q3 - q2).abs(), + cmp::max((q2 - q1).abs(), (q1 - q0).abs()), ), ), ), - shift, - ), - blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift), + )), + blimit_to_level::((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2), ) as usize } @@ -669,17 +670,17 @@ fn flat8( } #[inline] -fn deblock_size8_inner( - [p3, p2, p1, p0, q0, q1, q2, q3]: [i32; 8], level: usize, bd: usize, +fn deblock_size8_inner( + [p3, p2, p1, p0, q0, q1, q2, q3]: [i32; 8], level: usize, ) -> Option<[i32; 6]> { - if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level { - let flat = 1 << (bd - 8); + if mask8::(p3, p2, p1, p0, q0, q1, q2, q3) <= level { + let flat = 1 << (BD - 8); let x = if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat { filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3) - } else if nhev4(p1, p0, q0, q1, bd - 8) <= level { - filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8) + } else if nhev4::(p1, p0, q0, q1) <= level { + filter_narrow4_6::(p2, p1, p0, q0, q1, q2) } else { - filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8) + filter_narrow2_6::(p2, p1, p0, q0, q1, q2) }; Some(x) } else { @@ -688,8 +689,8 @@ fn deblock_size8_inner( } // Assumes rec[0] is set 4 taps back from the edge -fn deblock_v_size8( - rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, +fn deblock_v_size8( + rec: &mut PlaneRegionMut<'_, T>, level: usize, ) { for y in 0..4 { let p = &rec[y]; @@ -703,15 +704,15 @@ fn deblock_v_size8( p[6].as_(), p[7].as_(), ]; - if let Some(data) = deblock_size8_inner(vals, level, bd) { + if let Some(data) = deblock_size8_inner::(vals, level) { copy_horizontal(rec, 1, y, &data); } } } // Assumes rec[0] is set 4 taps back from the edge -fn deblock_h_size8( - rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, +fn deblock_h_size8( + rec: &mut PlaneRegionMut<'_, T>, level: usize, ) { for x in 0..4 { let vals = [ @@ -724,7 +725,7 @@ fn deblock_h_size8( rec[6][x].as_(), rec[7][x].as_(), ]; - if let Some(data) = deblock_size8_inner(vals, level, bd) { + if let Some(data) = deblock_size8_inner::(vals, level) { copy_vertical(rec, x, 1, &data); } } @@ -732,11 +733,11 @@ fn deblock_h_size8( // Assumes rec[0] and src[0] are set 4 taps back from the edge. // Accesses eight taps, accumulates six pixels into the tally -fn sse_size8( +fn sse_size8( rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, - tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, + tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, ) { - let flat = 1 << (bd - 8); + let flat = 1 << (BD - 8); for i in 0..4 { let (p3, p2, p1, p0, q0, q1, q2, q3, a) = if horizontal_p { @@ -786,19 +787,19 @@ fn sse_size8( // Four possibilities: no filter, wide8, narrow2 and narrow4 let none: [_; 6] = [p2, p1, p0, q0, q1, q2]; let wide8: [_; 6] = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); - let narrow2: [_; 6] = filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8); - let narrow4: [_; 6] = filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8); + let narrow2: [_; 6] = filter_narrow2_6::(p2, p1, p0, q0, q1, q2); + let narrow4: [_; 6] = filter_narrow4_6::(p2, p1, p0, q0, q1, q2); // mask8 sets the dividing line for filter vs no filter // flat8 decides between wide and narrow filters (unrelated to level) // nhev4 sets the dividing line between narrow2 and narrow4 let mask = clamp( - mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8), + mask8::(p3, p2, p1, p0, q0, q1, q2, q3), 1, MAX_LOOP_FILTER + 1, ); let flatp = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat; - let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1); + let nhev = clamp(nhev4::(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1); // sse for each; short-circuit the 'special' no-op cases. let sse_none = stride_sse(&a, &none); @@ -845,13 +846,13 @@ fn flat14_outer( } #[inline] -fn deblock_size14_inner( +fn deblock_size14_inner( [p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6]: [i32; 14], - level: usize, bd: usize, + level: usize, ) -> Option<[i32; 12]> { // 'mask' test - if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level { - let flat = 1 << (bd - 8); + if mask8::(p3, p2, p1, p0, q0, q1, q2, q3) <= level { + let flat = 1 << (BD - 8); // inner flatness test let x = if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat { // outer flatness test @@ -864,11 +865,11 @@ fn deblock_size14_inner( // only flat in inner area, run 8-tap filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5) } - } else if nhev4(p1, p0, q0, q1, bd - 8) <= level { + } else if nhev4::(p1, p0, q0, q1) <= level { // not flat, run narrow filter - filter_narrow4_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8) + filter_narrow4_12::(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5) } else { - filter_narrow2_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8) + filter_narrow2_12::(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5) }; Some(x) } else { @@ -877,8 +878,8 @@ fn deblock_size14_inner( } // Assumes rec[0] is set 7 taps back from the edge -fn deblock_v_size14( - rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, +fn deblock_v_size14( + rec: &mut PlaneRegionMut<'_, T>, level: usize, ) { for y in 0..4 { let p = &rec[y]; @@ -898,15 +899,15 @@ fn deblock_v_size14( p[12].as_(), p[13].as_(), ]; - if let Some(data) = deblock_size14_inner(vals, level, bd) { + if let Some(data) = deblock_size14_inner::(vals, level) { copy_horizontal(rec, 1, y, &data); } } } // Assumes rec[0] is set 7 taps back from the edge -fn deblock_h_size14( - rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, +fn deblock_h_size14( + rec: &mut PlaneRegionMut<'_, T>, level: usize, ) { for x in 0..4 { let vals = [ @@ -925,7 +926,7 @@ fn deblock_h_size14( rec[12][x].as_(), rec[13][x].as_(), ]; - if let Some(data) = deblock_size14_inner(vals, level, bd) { + if let Some(data) = deblock_size14_inner::(vals, level) { copy_vertical(rec, x, 1, &data); } } @@ -933,11 +934,11 @@ fn deblock_h_size14( // Assumes rec[0] and src[0] are set 7 taps back from the edge. // Accesses fourteen taps, accumulates twelve pixels into the tally -fn sse_size14( +fn sse_size14( rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, - tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, + tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, ) { - let flat = 1 << (bd - 8); + let flat = 1 << (BD - 8); for i in 0..4 { let (p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, a) = if horizontal_p { @@ -1014,49 +1015,23 @@ fn sse_size14( filter_wide14_12(p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6); let wide8 = filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5); - let narrow2 = filter_narrow2_12( - p5, - p4, - p3, - p2, - p1, - p0, - q0, - q1, - q2, - q3, - q4, - q5, - bd - 8, - ); - let narrow4 = filter_narrow4_12( - p5, - p4, - p3, - p2, - p1, - p0, - q0, - q1, - q2, - q3, - q4, - q5, - bd - 8, - ); + let narrow2 = + filter_narrow2_12::(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5); + let narrow4 = + filter_narrow4_12::(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5); // mask8 sets the dividing line for filter vs no filter // flat8 decides between wide and narrow filters (unrelated to level) // flat14 decides between wide14 and wide8 filters // nhev4 sets the dividing line between narrow2 and narrow4 let mask = clamp( - mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8), + mask8::(p3, p2, p1, p0, q0, q1, q2, q3), 1, MAX_LOOP_FILTER + 1, ); let flat8p = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat; let flat14p = flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat; - let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1); + let nhev = clamp(nhev4::(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1); // sse for each; short-circuit the 'special' no-op cases. let sse_none = stride_sse(&a, &none); @@ -1098,9 +1073,9 @@ fn sse_size14( } } -fn filter_v_edge( +fn filter_v_edge( deblock: &DeblockState, blocks: &TileBlocks, bo: TileBlockOffset, - p: &mut PlaneRegionMut, pli: usize, bd: usize, xdec: usize, ydec: usize, + p: &mut PlaneRegionMut, pli: usize, xdec: usize, ydec: usize, ) { let block = &blocks[bo]; let txsize = if pli == 0 { @@ -1126,16 +1101,16 @@ fn filter_v_edge( }); match filter_size { 4 => { - deblock_v_size4(&mut plane_region, level, bd); + deblock_v_size4::<_, BD>(&mut plane_region, level); } 6 => { - deblock_v_size6(&mut plane_region, level, bd); + deblock_v_size6::<_, BD>(&mut plane_region, level); } 8 => { - deblock_v_size8(&mut plane_region, level, bd); + deblock_v_size8::<_, BD>(&mut plane_region, level); } 14 => { - deblock_v_size14(&mut plane_region, level, bd); + deblock_v_size14::<_, BD>(&mut plane_region, level); } _ => unreachable!(), } @@ -1144,10 +1119,10 @@ fn filter_v_edge( } } -fn sse_v_edge( +fn sse_v_edge( blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion, src_plane: &PlaneRegion, tally: &mut [i64; MAX_LOOP_FILTER + 2], - pli: usize, bd: usize, xdec: usize, ydec: usize, + pli: usize, xdec: usize, ydec: usize, ) { let block = &blocks[bo]; let txsize = if pli == 0 { @@ -1177,16 +1152,16 @@ fn sse_v_edge( }); match filter_size { 4 => { - sse_size4(&rec_region, &src_region, tally, false, bd); + sse_size4::<_, BD>(&rec_region, &src_region, tally, false); } 6 => { - sse_size6(&rec_region, &src_region, tally, false, bd); + sse_size6::<_, BD>(&rec_region, &src_region, tally, false); } 8 => { - sse_size8(&rec_region, &src_region, tally, false, bd); + sse_size8::<_, BD>(&rec_region, &src_region, tally, false); } 14 => { - sse_size14(&rec_region, &src_region, tally, false, bd); + sse_size14::<_, BD>(&rec_region, &src_region, tally, false); } _ => unreachable!(), } @@ -1194,9 +1169,9 @@ fn sse_v_edge( } } -fn filter_h_edge( +fn filter_h_edge( deblock: &DeblockState, blocks: &TileBlocks, bo: TileBlockOffset, - p: &mut PlaneRegionMut, pli: usize, bd: usize, xdec: usize, ydec: usize, + p: &mut PlaneRegionMut, pli: usize, xdec: usize, ydec: usize, ) { let block = &blocks[bo]; let txsize = if pli == 0 { @@ -1222,16 +1197,16 @@ fn filter_h_edge( }); match filter_size { 4 => { - deblock_h_size4(&mut plane_region, level, bd); + deblock_h_size4::<_, BD>(&mut plane_region, level); } 6 => { - deblock_h_size6(&mut plane_region, level, bd); + deblock_h_size6::<_, BD>(&mut plane_region, level); } 8 => { - deblock_h_size8(&mut plane_region, level, bd); + deblock_h_size8::<_, BD>(&mut plane_region, level); } 14 => { - deblock_h_size14(&mut plane_region, level, bd); + deblock_h_size14::<_, BD>(&mut plane_region, level); } _ => unreachable!(), } @@ -1240,10 +1215,10 @@ fn filter_h_edge( } } -fn sse_h_edge( +fn sse_h_edge( blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion, src_plane: &PlaneRegion, tally: &mut [i64; MAX_LOOP_FILTER + 2], - pli: usize, bd: usize, xdec: usize, ydec: usize, + pli: usize, xdec: usize, ydec: usize, ) { let block = &blocks[bo]; let txsize = if pli == 0 { @@ -1274,16 +1249,16 @@ fn sse_h_edge( match filter_size { 4 => { - sse_size4(&rec_region, &src_region, tally, true, bd); + sse_size4::<_, BD>(&rec_region, &src_region, tally, true); } 6 => { - sse_size6(&rec_region, &src_region, tally, true, bd); + sse_size6::<_, BD>(&rec_region, &src_region, tally, true); } 8 => { - sse_size8(&rec_region, &src_region, tally, true, bd); + sse_size8::<_, BD>(&rec_region, &src_region, tally, true); } 14 => { - sse_size14(&rec_region, &src_region, tally, true, bd); + sse_size14::<_, BD>(&rec_region, &src_region, tally, true); } _ => unreachable!(), } @@ -1293,9 +1268,9 @@ fn sse_h_edge( // Deblocks all edges, vertical and horizontal, in a single plane #[hawktracer(deblock_plane)] -pub fn deblock_plane( +pub fn deblock_plane( deblock: &DeblockState, p: &mut PlaneRegionMut, pli: usize, - blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize, + blocks: &TileBlocks, crop_w: usize, crop_h: usize, ) { let xdec = p.plane_cfg.xdec; let ydec = p.plane_cfg.ydec; @@ -1339,26 +1314,24 @@ pub fn deblock_plane( // edge). Unroll to avoid corner-cases. if rows > 0 { for x in (1 << xdec..cols).step_by(1 << xdec) { - filter_v_edge( + filter_v_edge::<_, BD>( deblock, blocks, TileBlockOffset(BlockOffset { x, y: 0 }), p, pli, - bd, xdec, ydec, ); } if rows > 1 << ydec { for x in (1 << xdec..cols).step_by(1 << xdec) { - filter_v_edge( + filter_v_edge::<_, BD>( deblock, blocks, TileBlockOffset(BlockOffset { x, y: 1 << ydec }), p, pli, - bd, xdec, ydec, ); @@ -1371,13 +1344,12 @@ pub fn deblock_plane( for y in ((2 << ydec)..rows).step_by(1 << ydec) { // Check for vertical edge at first MI block boundary on this row if cols > 1 << xdec { - filter_v_edge( + filter_v_edge::<_, BD>( deblock, blocks, TileBlockOffset(BlockOffset { x: 1 << xdec, y }), p, pli, - bd, xdec, ydec, ); @@ -1385,17 +1357,16 @@ pub fn deblock_plane( // run the rest of the row with both vertical and horizontal edge filtering. // Horizontal lags vertical edge by one row and two columns. for x in (2 << xdec..cols).step_by(1 << xdec) { - filter_v_edge( + filter_v_edge::<_, BD>( deblock, blocks, TileBlockOffset(BlockOffset { x, y }), p, pli, - bd, xdec, ydec, ); - filter_h_edge( + filter_h_edge::<_, BD>( deblock, blocks, TileBlockOffset(BlockOffset { @@ -1404,14 +1375,13 @@ pub fn deblock_plane( }), p, pli, - bd, xdec, ydec, ); } // ..and the last two horizontal edges for the row if cols >= 2 << xdec { - filter_h_edge( + filter_h_edge::<_, BD>( deblock, blocks, TileBlockOffset(BlockOffset { @@ -1420,13 +1390,12 @@ pub fn deblock_plane( }), p, pli, - bd, xdec, ydec, ); } if cols >= 1 << xdec { - filter_h_edge( + filter_h_edge::<_, BD>( deblock, blocks, TileBlockOffset(BlockOffset { @@ -1435,7 +1404,6 @@ pub fn deblock_plane( }), p, pli, - bd, xdec, ydec, ); @@ -1445,13 +1413,12 @@ pub fn deblock_plane( // Last horizontal row, vertical is already complete if rows > 1 << ydec { for x in (0..cols).step_by(1 << xdec) { - filter_h_edge( + filter_h_edge::<_, BD>( deblock, blocks, TileBlockOffset(BlockOffset { x, y: rows - (1 << ydec) }), p, pli, - bd, xdec, ydec, ); @@ -1460,11 +1427,11 @@ pub fn deblock_plane( } // sse count of all edges in a single plane, accumulates into vertical and horizontal counts -fn sse_plane( +fn sse_plane( rec: &PlaneRegion, src: &PlaneRegion, v_sse: &mut [i64; MAX_LOOP_FILTER + 2], h_sse: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, blocks: &TileBlocks, - crop_w: usize, crop_h: usize, bd: usize, + crop_w: usize, crop_h: usize, ) { let xdec = rec.plane_cfg.xdec; let ydec = rec.plane_cfg.ydec; @@ -1485,14 +1452,13 @@ fn sse_plane( // No horizontal edge filtering along top of frame for x in (1 << xdec..cols).step_by(1 << xdec) { - sse_v_edge( + sse_v_edge::<_, BD>( blocks, TileBlockOffset(BlockOffset { x, y: 0 }), rec, src, v_sse, pli, - bd, xdec, ydec, ); @@ -1503,37 +1469,34 @@ fn sse_plane( // behind vertical. for y in (1 << ydec..rows).step_by(1 << ydec) { // No vertical filtering along left edge of frame - sse_h_edge( + sse_h_edge::<_, BD>( blocks, TileBlockOffset(BlockOffset { x: 0, y }), rec, src, h_sse, pli, - bd, xdec, ydec, ); for x in (1 << xdec..cols).step_by(1 << xdec) { - sse_v_edge( + sse_v_edge::<_, BD>( blocks, TileBlockOffset(BlockOffset { x, y }), rec, src, v_sse, pli, - bd, xdec, ydec, ); - sse_h_edge( + sse_h_edge::<_, BD>( blocks, TileBlockOffset(BlockOffset { x, y }), rec, src, h_sse, pli, - bd, xdec, ydec, ); @@ -1543,18 +1506,18 @@ fn sse_plane( // Deblocks all edges in all planes of a frame #[hawktracer(deblock_filter_frame)] -pub fn deblock_filter_frame( +pub fn deblock_filter_frame( deblock: &DeblockState, tile: &mut TileMut, blocks: &TileBlocks, - crop_w: usize, crop_h: usize, bd: usize, planes: usize, + crop_w: usize, crop_h: usize, planes: usize, ) { tile.planes[..planes].par_iter_mut().enumerate().for_each(|(pli, plane)| { - deblock_plane(deblock, plane, pli, blocks, crop_w, crop_h, bd); + deblock_plane::<_, BD>(deblock, plane, pli, blocks, crop_w, crop_h); }); } -fn sse_optimize( +fn sse_optimize( rec: &Tile, input: &Tile, blocks: &TileBlocks, crop_w: usize, - crop_h: usize, bd: usize, monochrome: bool, + crop_h: usize, monochrome: bool, ) -> [u8; 4] { // i64 allows us to accumulate a total of ~ 35 bits worth of pixels assert!( @@ -1569,7 +1532,7 @@ fn sse_optimize( let mut v_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2]; let mut h_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2]; - sse_plane( + sse_plane::<_, BD>( &rec.planes[pli], &input.planes[pli], &mut v_tally, @@ -1578,7 +1541,6 @@ fn sse_optimize( blocks, crop_w, crop_h, - bd, ); for i in 1..=MAX_LOOP_FILTER { @@ -1619,14 +1581,14 @@ fn sse_optimize( } #[hawktracer(deblock_filter_optimize)] -pub fn deblock_filter_optimize( +pub fn deblock_filter_optimize( fi: &FrameInvariants, rec: &Tile, input: &Tile, blocks: &TileBlocks, crop_w: usize, crop_h: usize, ) -> [u8; 4] { if fi.config.speed_settings.fast_deblock { - let q = ac_q(fi.base_q_idx, 0, fi.sequence.bit_depth).get() as i32; + let q = ac_q::(fi.base_q_idx, 0).get() as i32; let level = clamp( - match fi.sequence.bit_depth { + match BD { 8 => { if fi.frame_type == FrameType::KEY { (q * 17563 - 421_574 + (1 << 18 >> 1)) >> 18 @@ -1657,13 +1619,12 @@ pub fn deblock_filter_optimize( } else { // Deblocking happens in 4x4 (luma) units; luma x,y are clipped to // the *crop frame* of the entire frame by 4x4 block. - sse_optimize( + sse_optimize::<_, BD>( rec, input, blocks, crop_w, crop_h, - fi.sequence.bit_depth, fi.sequence.chroma_sampling == Cs400, ) } diff --git a/src/dist.rs b/src/dist.rs index 4b5536a841..aaf2e3e289 100644 --- a/src/dist.rs +++ b/src/dist.rs @@ -32,7 +32,7 @@ pub(crate) mod rust { /// w and h can be at most 128, the size of the largest block. pub fn get_sad( plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize, - h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel, + h: usize, _cpu: CpuFeatureLevel, ) -> u32 { debug_assert!(w <= 128 && h <= 128); let plane_org = @@ -157,7 +157,7 @@ pub(crate) mod rust { /// 4x4 transforms instead of 8x8 transforms when width or height < 8. pub fn get_satd( plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize, - h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel, + h: usize, _cpu: CpuFeatureLevel, ) -> u32 { assert!(w <= 128 && h <= 128); assert!(plane_org.rect().width >= w && plane_org.rect().height >= h); @@ -186,9 +186,8 @@ pub(crate) mod rust { // Revert to sad on edge blocks (frame edges) if chunk_w != size || chunk_h != size { - sum += get_sad( - &chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu, - ) as u64; + sum += + get_sad(&chunk_org, &chunk_ref, chunk_w, chunk_h, _cpu) as u64; continue; } @@ -235,8 +234,7 @@ pub(crate) mod rust { #[inline(never)] pub fn get_weighted_sse( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32], - scale_stride: usize, w: usize, h: usize, _bit_depth: usize, - _cpu: CpuFeatureLevel, + scale_stride: usize, w: usize, h: usize, _cpu: CpuFeatureLevel, ) -> u64 { let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h }); // Always chunk and apply scaling on the sse of squares the size of @@ -301,9 +299,9 @@ pub(crate) mod rust { /// Computes a distortion metric of the sum of squares weighted by activity. /// w and h should be <= 8. #[inline(never)] - pub fn cdef_dist_kernel( + pub fn cdef_dist_kernel( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, - bit_depth: usize, _cpu: CpuFeatureLevel, + _cpu: CpuFeatureLevel, ) -> u32 { // TODO: Investigate using different constants in ssim boost for block sizes // smaller than 8x8. @@ -370,7 +368,7 @@ pub(crate) mod rust { dvar = ((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32; - apply_ssim_boost(sse, svar, dvar, bit_depth) + apply_ssim_boost::(sse, svar, dvar) } } @@ -442,7 +440,6 @@ pub mod test { (64, 16, 116384), ]; - let bit_depth: usize = 8; let (input_plane, rec_plane) = setup_planes::(); for (w, h, distortion) in blocks { @@ -453,14 +450,7 @@ pub mod test { assert_eq!( distortion, - get_sad( - &input_region, - &rec_region, - w, - h, - bit_depth, - CpuFeatureLevel::default() - ) + get_sad(&input_region, &rec_region, w, h, CpuFeatureLevel::default()) ); } } @@ -475,7 +465,7 @@ pub mod test { get_sad_same_inner::(); } - fn get_satd_same_inner() { + fn get_satd_same_inner() { let blocks: Vec<(usize, usize, u32)> = vec![ (4, 4, 1408), (4, 8, 2016), @@ -501,7 +491,6 @@ pub mod test { (64, 16, 21312), ]; - let bit_depth: usize = 8; let (input_plane, rec_plane) = setup_planes::(); for (w, h, distortion) in blocks { @@ -512,12 +501,11 @@ pub mod test { assert_eq!( distortion, - get_satd( + get_satd::<_, BD>( &input_region, &rec_region, w, h, - bit_depth, CpuFeatureLevel::default() ) ); @@ -526,11 +514,11 @@ pub mod test { #[test] fn get_satd_same_u8() { - get_satd_same_inner::(); + get_satd_same_inner::(); } #[test] fn get_satd_same_u16() { - get_satd_same_inner::(); + get_satd_same_inner::(); } } diff --git a/src/encoder.rs b/src/encoder.rs index 2b8d2ee80e..c2b45ee37f 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -570,12 +570,12 @@ pub struct SegmentationState { } impl SegmentationState { - pub fn update_threshold(&mut self, base_q_idx: u8, bd: usize) { - let base_ac_q = ac_q(base_q_idx, 0, bd).get() as u64; + pub fn update_threshold(&mut self, base_q_idx: u8) { + let base_ac_q = ac_q::(base_q_idx, 0).get() as u64; let real_ac_q = ArrayVec::<_, MAX_SEGMENTS>::from_iter( self.data[..=self.max_segment as usize].iter().map(|data| { - ac_q(base_q_idx, data[SegLvl::SEG_LVL_ALT_Q as usize] as i8, bd).get() - as u64 + ac_q::(base_q_idx, data[SegLvl::SEG_LVL_ALT_Q as usize] as i8) + .get() as u64 }), ); self.threshold.fill(DistortionScale(0)); @@ -1246,15 +1246,16 @@ impl FrameInvariants { (uv_f1 * CDEF_SEC_STRENGTHS as i32 + uv_f2) as u8; } - pub fn set_quantizers(&mut self, qps: &QuantizerParameters) { + pub fn set_quantizers( + &mut self, qps: &QuantizerParameters, + ) { self.base_q_idx = qps.ac_qi[0]; let base_q_idx = self.base_q_idx as i32; for pi in 0..3 { self.dc_delta_q[pi] = (qps.dc_qi[pi] as i32 - base_q_idx) as i8; self.ac_delta_q[pi] = (qps.ac_qi[pi] as i32 - base_q_idx) as i8; } - self.lambda = - qps.lambda * ((1 << (2 * (self.sequence.bit_depth - 8))) as f64); + self.lambda = qps.lambda * ((1 << (2 * (BD - 8))) as f64); self.me_lambda = self.lambda.sqrt(); self.dist_scale = qps.dist_scale.map(DistortionScale::from); @@ -1394,7 +1395,7 @@ fn get_qidx( /// /// - If the block size is invalid for subsampling /// - If a tx type other than DCT is used for 64x64 blocks -pub fn encode_tx_block( +pub fn encode_tx_block( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, @@ -1465,8 +1466,7 @@ pub fn encode_tx_block( let rec = &mut ts.rec.planes[p]; if mode.is_intra() { - let bit_depth = fi.sequence.bit_depth; - let edge_buf = get_intra_edges( + let edge_buf = get_intra_edges::<_, BD>( &rec.as_const(), tile_partition_bo, bx, @@ -1474,17 +1474,15 @@ pub fn encode_tx_block( bsize, po, tx_size, - bit_depth, Some(mode), fi.sequence.enable_intra_edge_filter, pred_intra_param, ); - mode.predict_intra( + mode.predict_intra::<_, BD>( tile_rect, &mut rec.subregion_mut(area), tx_size, - bit_depth, ac, pred_intra_param, ief_params, @@ -1536,13 +1534,12 @@ pub fn encode_tx_block( residual.fill(0); } - forward_transform( + forward_transform::<_, BD>( residual, coeffs, tx_size.width(), tx_size, tx_type, - fi.sequence.bit_depth, fi.cpu_feature_level, ); @@ -1579,13 +1576,12 @@ pub fn encode_tx_block( }; // Reconstruct - dequantize( + dequantize::<_, BD>( qidx, qcoeffs, eob, rcoeffs, tx_size, - fi.sequence.bit_depth, fi.dc_delta_q[p], fi.ac_delta_q[p], fi.cpu_feature_level, @@ -1594,13 +1590,12 @@ pub fn encode_tx_block( if eob == 0 { // All zero coefficients is a no-op } else if !fi.use_tx_domain_distortion || need_recon_pixel { - inverse_transform_add( + inverse_transform_add::<_, BD>( rcoeffs, &mut rec.subregion_mut(area), eob, tx_size, tx_type, - fi.sequence.bit_depth, fi.cpu_feature_level, ); } @@ -1654,7 +1649,7 @@ pub fn encode_tx_block( /// # Panics /// /// - If the block size is invalid for subsampling -pub fn motion_compensate( +pub fn motion_compensate( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset, @@ -1713,7 +1708,7 @@ pub fn motion_compensate( }; if some_use_intra { - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1741,7 +1736,7 @@ pub fn motion_compensate( let area2 = Area::StartingAt { x: po2.x, y: po2.y }; let po3 = PlaneOffset { x: po.x + 2, y: po.y + 2 }; let area3 = Area::StartingAt { x: po3.x, y: po3.y }; - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1753,7 +1748,7 @@ pub fn motion_compensate( mv0, compound_buffer, ); - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1765,7 +1760,7 @@ pub fn motion_compensate( mv1, compound_buffer, ); - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1777,7 +1772,7 @@ pub fn motion_compensate( mv2, compound_buffer, ); - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1793,7 +1788,7 @@ pub fn motion_compensate( if bsize == BlockSize::BLOCK_8X4 { let mv1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].mv; let rf1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].ref_frames; - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1807,7 +1802,7 @@ pub fn motion_compensate( ); let po3 = PlaneOffset { x: po.x, y: po.y + 2 }; let area3 = Area::StartingAt { x: po3.x, y: po3.y }; - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1823,7 +1818,7 @@ pub fn motion_compensate( if bsize == BlockSize::BLOCK_4X8 { let mv2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].mv; let rf2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].ref_frames; - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1837,7 +1832,7 @@ pub fn motion_compensate( ); let po3 = PlaneOffset { x: po.x + 2, y: po.y }; let area3 = Area::StartingAt { x: po3.x, y: po3.y }; - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1852,7 +1847,7 @@ pub fn motion_compensate( } } } else { - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, p, @@ -1922,7 +1917,7 @@ pub fn encode_block_pre_cdef( /// /// - If chroma and luma do not match for inter modes /// - If an invalid motion vector is found -pub fn encode_block_post_cdef( +pub fn encode_block_post_cdef( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode, chroma_mode: PredictionMode, angle_delta: AngleDelta, @@ -2184,10 +2179,10 @@ pub fn encode_block_post_cdef( } if is_inter { - motion_compensate( + motion_compensate::<_, BD>( fi, ts, cw, luma_mode, ref_frames, mvs, bsize, tile_bo, false, ); - write_tx_tree( + write_tx_tree::<_, _, BD>( fi, ts, cw, @@ -2204,7 +2199,7 @@ pub fn encode_block_post_cdef( need_recon_pixel, ) } else { - write_tx_blocks( + write_tx_blocks::<_, _, BD>( fi, ts, cw, @@ -2228,7 +2223,7 @@ pub fn encode_block_post_cdef( /// # Panics /// /// - If attempting to encode a lossless block (not yet supported) -pub fn write_tx_blocks( +pub fn write_tx_blocks( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode, chroma_mode: PredictionMode, angle_delta: AngleDelta, @@ -2249,14 +2244,7 @@ pub fn write_tx_blocks( let do_chroma = has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); - ts.qc.update( - qidx, - tx_size, - luma_mode.is_intra(), - fi.sequence.bit_depth, - fi.dc_delta_q[0], - 0, - ); + ts.qc.update::(qidx, tx_size, luma_mode.is_intra(), fi.dc_delta_q[0], 0); for by in 0..bh { for bx in 0..bw { @@ -2268,7 +2256,7 @@ pub fn write_tx_blocks( continue; } let po = tx_bo.plane_offset(&ts.input.planes[0].cfg); - let (has_coeff, dist) = encode_tx_block( + let (has_coeff, dist) = encode_tx_block::<_, _, BD>( fi, ts, cw, @@ -2333,11 +2321,10 @@ pub fn write_tx_blocks( }; for p in 1..3 { - ts.qc.update( + ts.qc.update::( qidx, uv_tx_size, true, - fi.sequence.bit_depth, fi.dc_delta_q[p], fi.ac_delta_q[p], ); @@ -2354,7 +2341,7 @@ pub fn write_tx_blocks( let mut po = tile_bo.plane_offset(&ts.input.planes[p].cfg); po.x += (bx * uv_tx_size.width()) as isize; po.y += (by * uv_tx_size.height()) as isize; - let (has_coeff, dist) = encode_tx_block( + let (has_coeff, dist) = encode_tx_block::<_, _, BD>( fi, ts, cw, @@ -2389,7 +2376,7 @@ pub fn write_tx_blocks( (partition_has_coeff, tx_dist) } -pub fn write_tx_tree( +pub fn write_tx_tree( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode, angle_delta_y: i8, tile_bo: TileBlockOffset, bsize: BlockSize, @@ -2408,14 +2395,7 @@ pub fn write_tx_tree( let mut partition_has_coeff: bool = false; let mut tx_dist = ScaledDistortion::zero(); - ts.qc.update( - qidx, - tx_size, - luma_mode.is_intra(), - fi.sequence.bit_depth, - fi.dc_delta_q[0], - 0, - ); + ts.qc.update::(qidx, tx_size, luma_mode.is_intra(), fi.dc_delta_q[0], 0); // TODO: If tx-parition more than only 1-level, this code does not work. // It should recursively traverse the tx block that are split recursivelty by calling write_tx_tree(), @@ -2431,7 +2411,7 @@ pub fn write_tx_tree( } let po = tx_bo.plane_offset(&ts.input.planes[0].cfg); - let (has_coeff, dist) = encode_tx_block( + let (has_coeff, dist) = encode_tx_block::<_, _, BD>( fi, ts, cw, @@ -2494,11 +2474,10 @@ pub fn write_tx_tree( }; for p in 1..3 { - ts.qc.update( + ts.qc.update::( qidx, uv_tx_size, false, - fi.sequence.bit_depth, fi.dc_delta_q[p], fi.ac_delta_q[p], ); @@ -2515,7 +2494,7 @@ pub fn write_tx_tree( let mut po = tile_bo.plane_offset(&ts.input.planes[p].cfg); po.x += (bx * uv_tx_size.width()) as isize; po.y += (by * uv_tx_size.height()) as isize; - let (has_coeff, dist) = encode_tx_block( + let (has_coeff, dist) = encode_tx_block::<_, _, BD>( fi, ts, cw, @@ -2546,7 +2525,7 @@ pub fn write_tx_tree( (partition_has_coeff, tx_dist) } -pub fn encode_block_with_modes( +pub fn encode_block_with_modes( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, @@ -2572,7 +2551,7 @@ pub fn encode_block_with_modes( let (tx_size, tx_type) = if !mode_decision.skip && !mode_decision.has_coeff { skip = true; - rdo_tx_size_type( + rdo_tx_size_type::<_, BD>( fi, ts, cw, bsize, tile_bo, mode_luma, ref_frames, mvs, skip, ) } else { @@ -2588,7 +2567,7 @@ pub fn encode_block_with_modes( tile_bo, skip, ); - encode_block_post_cdef( + encode_block_post_cdef::<_, _, BD>( fi, ts, cw, @@ -2612,7 +2591,7 @@ pub fn encode_block_with_modes( ); } -fn encode_partition_bottomup( +fn encode_partition_bottomup( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, ref_rd_cost: f64, @@ -2673,7 +2652,7 @@ fn encode_partition_bottomup( }; let mode_decision = - rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg); + rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg); if !mode_decision.pred_mode_luma.is_intra() { // Fill the saved motion structure @@ -2693,7 +2672,7 @@ fn encode_partition_bottomup( rdo_output.part_modes.push(mode_decision.clone()); if !can_split { - encode_block_with_modes( + encode_block_with_modes::<_, _, BD>( fi, ts, cw, @@ -2783,7 +2762,7 @@ fn encode_partition_bottomup( if offset.0.x >= ts.mi_width || offset.0.y >= ts.mi_height { continue; } - let child_rdo_output = encode_partition_bottomup( + let child_rdo_output = encode_partition_bottomup::<_, _, BD>( fi, ts, cw, @@ -2856,7 +2835,7 @@ fn encode_partition_bottomup( } // FIXME: redundant block re-encode - encode_block_with_modes( + encode_block_with_modes::<_, _, BD>( fi, ts, cw, @@ -2895,7 +2874,7 @@ fn encode_partition_bottomup( rdo_output } -fn encode_partition_topdown( +fn encode_partition_topdown( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, @@ -2939,7 +2918,7 @@ fn encode_partition_topdown( debug_assert!(bsize.is_sqr()); // Blocks of sizes within the supported range are subjected to a partitioning decision - rdo_output = rdo_partition_decision( + rdo_output = rdo_partition_decision::<_, _, BD>( fi, ts, cw, @@ -2977,7 +2956,7 @@ fn encode_partition_topdown( } else { // Make a prediction mode decision for blocks encoded with no rdo_partition_decision call (e.g. edges) rdo_decision = - rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg); + rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg); &rdo_decision }; @@ -2997,7 +2976,7 @@ fn encode_partition_topdown( // NOTE: Cannot avoid calling rdo_tx_size_type() here again, // because, with top-down partition RDO, the neighboring contexts // of current partition can change, i.e. neighboring partitions can split down more. - let (tx_size, tx_type) = rdo_tx_size_type( + let (tx_size, tx_type) = rdo_tx_size_type::<_, BD>( fi, ts, cw, bsize, tile_bo, mode_luma, ref_frames, mvs, skip, ); @@ -3105,7 +3084,7 @@ fn encode_partition_topdown( tile_bo, skip, ); - encode_block_post_cdef( + encode_block_post_cdef::<_, _, BD>( fi, ts, cw, @@ -3135,7 +3114,7 @@ fn encode_partition_topdown( // The optimal prediction modes for each split block is known from an rdo_partition_decision() call for mode in rdo_output.part_modes { // Each block is subjected to a new splitting decision - encode_partition_topdown( + encode_partition_topdown::<_, _, BD>( fi, ts, cw, @@ -3174,7 +3153,7 @@ fn encode_partition_topdown( let partitions = get_sub_partitions(&four_partitions, partition); partitions.iter().for_each(|&offset| { - encode_partition_topdown( + encode_partition_topdown::<_, _, BD>( fi, ts, cw, @@ -3215,7 +3194,7 @@ fn get_initial_cdfcontext(fi: &FrameInvariants) -> CDFContext { } #[hawktracer(encode_tile_group)] -fn encode_tile_group( +fn encode_tile_group( fi: &FrameInvariants, fs: &mut FrameState, inter_cfg: &InterConfig, ) -> Vec { let planes = @@ -3233,7 +3212,7 @@ fn encode_tile_group( .collect::>() .into_par_iter() .map(|(mut ctx, cdf)| { - encode_tile(fi, &mut ctx.ts, cdf, &mut ctx.tb, inter_cfg) + encode_tile::<_, BD>(fi, &mut ctx.ts, cdf, &mut ctx.tb, inter_cfg) }) .unzip(); @@ -3247,7 +3226,7 @@ fn encode_tile_group( /* TODO: Don't apply if lossless */ let levels = fs.apply_tile_state_mut(|ts| { let rec = &mut ts.rec; - deblock_filter_optimize( + deblock_filter_optimize::<_, _, BD>( fi, &rec.as_const(), &ts.input.as_tile(), @@ -3261,13 +3240,12 @@ fn encode_tile_group( if fs.deblock.levels[0] != 0 || fs.deblock.levels[1] != 0 { fs.apply_tile_state_mut(|ts| { let rec = &mut ts.rec; - deblock_filter_frame( + deblock_filter_frame::<_, BD>( ts.deblock, rec, &blocks.as_tile_blocks(), fi.width, fi.height, - fi.sequence.bit_depth, planes, ); }); @@ -3282,11 +3260,16 @@ fn encode_tile_group( if fi.sequence.enable_cdef { fs.apply_tile_state_mut(|ts| { let rec = &mut ts.rec; - cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec); + cdef_filter_tile::<_, BD>( + fi, + &deblocked_frame, + &blocks.as_tile_blocks(), + rec, + ); }); } /* TODO: Don't apply if lossless */ - fs.restoration.lrf_filter_frame( + fs.restoration.lrf_filter_frame::<_, BD>( Arc::get_mut(&mut fs.rec).unwrap(), &deblocked_frame, fi, @@ -3297,7 +3280,12 @@ fn encode_tile_group( let deblocked_frame = (*fs.rec).clone(); fs.apply_tile_state_mut(|ts| { let rec = &mut ts.rec; - cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec); + cdef_filter_tile::<_, BD>( + fi, + &deblocked_frame, + &blocks.as_tile_blocks(), + rec, + ); }); } } @@ -3353,7 +3341,7 @@ pub struct SBSQueueEntry { pub w_post_cdef: WriterBase, } -fn check_lf_queue( +fn check_lf_queue( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut WriterBase, sbs_q: &mut VecDeque, last_lru_ready: &mut [i32; 3], @@ -3405,7 +3393,7 @@ fn check_lf_queue( } } if !already_rdoed { - rdo_loop_decision(qe.sbo, fi, ts, cw, w, deblock_p); + rdo_loop_decision::<_, _, BD>(qe.sbo, fi, ts, cw, w, deblock_p); for pli in 0..planes { if qe.lru_index[pli] != -1 && last_lru_rdoed[pli] < qe.lru_index[pli] @@ -3445,7 +3433,7 @@ fn check_lf_queue( } #[hawktracer(encode_tile)] -fn encode_tile<'a, T: Pixel>( +fn encode_tile<'a, T: Pixel, const BD: usize>( fi: &FrameInvariants, ts: &'a mut TileStateMut<'_, T>, fc: &'a mut CDFContext, blocks: &'a mut TileBlocksMut<'a>, inter_cfg: &InterConfig, @@ -3492,7 +3480,7 @@ fn encode_tile<'a, T: Pixel>( || is_straddle_sbx || is_straddle_sby { - encode_partition_bottomup( + encode_partition_bottomup::<_, _, BD>( fi, ts, &mut cw, @@ -3505,7 +3493,7 @@ fn encode_tile<'a, T: Pixel>( &mut enc_stats, ); } else { - encode_partition_topdown( + encode_partition_topdown::<_, _, BD>( fi, ts, &mut cw, @@ -3547,7 +3535,7 @@ fn encode_tile<'a, T: Pixel>( sbs_q.push_back(sbs_qe); if check_queue && !fi.sequence.enable_delayed_loopfilter_rdo { - check_lf_queue( + check_lf_queue::<_, BD>( fi, ts, &mut cw, @@ -3566,7 +3554,7 @@ fn encode_tile<'a, T: Pixel>( if fi.sequence.enable_delayed_loopfilter_rdo { // Solve deblocking for just this tile /* TODO: Don't apply if lossless */ - let deblock_levels = deblock_filter_optimize( + let deblock_levels = deblock_filter_optimize::<_, _, BD>( fi, &ts.rec.as_const(), &ts.input_tile, @@ -3592,18 +3580,17 @@ fn encode_tile<'a, T: Pixel>( deblock_copy.levels = deblock_levels; // temporarily deblock the reference - deblock_filter_frame( + deblock_filter_frame::<_, BD>( &deblock_copy, &mut ts.rec, &cw.bc.blocks.as_const(), fi.width, fi.height, - fi.sequence.bit_depth, planes, ); // rdo lf and write - check_lf_queue( + check_lf_queue::<_, BD>( fi, ts, &mut cw, @@ -3627,7 +3614,7 @@ fn encode_tile<'a, T: Pixel>( } } else { // rdo lf and write - check_lf_queue( + check_lf_queue::<_, BD>( fi, ts, &mut cw, @@ -3743,7 +3730,7 @@ fn get_initial_segmentation( /// # Panics /// /// - If the frame packets cannot be written -pub fn encode_frame( +pub fn encode_frame( fi: &FrameInvariants, fs: &mut FrameState, inter_cfg: &InterConfig, ) -> Vec { debug_assert!(!fi.is_show_existing_frame()); @@ -3753,9 +3740,9 @@ pub fn encode_frame( if fi.enable_segmentation { fs.segmentation = get_initial_segmentation(fi); - segmentation_optimize(fi, fs); + segmentation_optimize::<_, BD>(fi, fs); } - let tile_group = encode_tile_group(fi, fs, inter_cfg); + let tile_group = encode_tile_group::<_, BD>(fi, fs, inter_cfg); if fi.frame_type == FrameType::KEY { write_key_frame_obus(&mut packet, fi, obu_extension).unwrap(); diff --git a/src/lrf.rs b/src/lrf.rs index f33a48826e..c793310498 100644 --- a/src/lrf.rs +++ b/src/lrf.rs @@ -626,7 +626,7 @@ pub fn setup_integral_image( } } -pub fn sgrproj_stripe_filter( +pub fn sgrproj_stripe_filter( set: u8, xqd: [i8; 2], fi: &FrameInvariants, integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize, cdeffed: &PlaneSlice, out: &mut PlaneRegionMut, @@ -647,19 +647,6 @@ pub fn sgrproj_stripe_filter( let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0]; let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1]; - let fn_ab_r1 = match fi.sequence.bit_depth { - 8 => sgrproj_box_ab_r1::<8>, - 10 => sgrproj_box_ab_r1::<10>, - 12 => sgrproj_box_ab_r1::<12>, - _ => unimplemented!(), - }; - let fn_ab_r2 = match fi.sequence.bit_depth { - 8 => sgrproj_box_ab_r2::<8>, - 10 => sgrproj_box_ab_r2::<10>, - 12 => sgrproj_box_ab_r2::<12>, - _ => unimplemented!(), - }; - /* prime the intermediate arrays */ // One oddness about the radius=2 intermediate array computations that // the spec doesn't make clear: Although the spec defines computation @@ -668,7 +655,7 @@ pub fn sgrproj_stripe_filter( let integral_image = &integral_image_buffer.integral_image; let sq_integral_image = &integral_image_buffer.sq_integral_image; if s_r2 > 0 { - fn_ab_r2( + sgrproj_box_ab_r2::( &mut a_r2[0], &mut b_r2[0], integral_image, @@ -682,7 +669,7 @@ pub fn sgrproj_stripe_filter( } if s_r1 > 0 { let integral_image_offset = integral_image_stride + 1; - fn_ab_r1( + sgrproj_box_ab_r1::( &mut a_r1[0], &mut b_r1[0], &integral_image[integral_image_offset..], @@ -693,7 +680,7 @@ pub fn sgrproj_stripe_filter( s_r1, fi.cpu_feature_level, ); - fn_ab_r1( + sgrproj_box_ab_r1::( &mut a_r1[1], &mut b_r1[1], &integral_image[integral_image_offset..], @@ -712,7 +699,7 @@ pub fn sgrproj_stripe_filter( for y in (0..stripe_h).step_by(2) { // get results to use y and y+1 let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 { - fn_ab_r2( + sgrproj_box_ab_r2::( &mut a_r2[(y / 2 + 1) % 2], &mut b_r2[(y / 2 + 1) % 2], integral_image, @@ -751,7 +738,7 @@ pub fn sgrproj_stripe_filter( let y = y + dy; if s_r1 > 0 { let integral_image_offset = integral_image_stride + 1; - fn_ab_r1( + sgrproj_box_ab_r1::( &mut a_r1[(y + 2) % 3], &mut b_r1[(y + 2) % 3], &integral_image[integral_image_offset..], @@ -793,9 +780,9 @@ pub fn sgrproj_stripe_filter( let line = &cdeffed[y]; #[inline(always)] - fn apply_filter( + fn apply_filter( out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32], - stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32, + stripe_w: usize, w0: i32, w1: i32, w2: i32, ) { let line_it = line[..stripe_w].iter(); let f_r2_ab_it = f_r2_ab[..stripe_w].iter(); @@ -809,17 +796,16 @@ pub fn sgrproj_stripe_filter( let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32; let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1)) >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); - *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1)); + *o = U::cast_from(clamp(s, 0, (1 << BD) - 1)); } } - apply_filter( + apply_filter::<_, BD>( &mut out[y], line, &f_r1, f_r2_ab[dy], stripe_w, - fi.sequence.bit_depth, w0, w1, w2, @@ -842,7 +828,7 @@ pub fn sgrproj_stripe_filter( // Input params follow the same rules as sgrproj_stripe_filter. // Inputs are relative to the colocated slice views. -pub fn sgrproj_solve( +pub fn sgrproj_solve( set: u8, fi: &FrameInvariants, integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>, cdeffed: &PlaneSlice, cdef_w: usize, cdef_h: usize, @@ -865,19 +851,6 @@ pub fn sgrproj_solve( let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]]; let mut c: [f64; 2] = [0., 0.]; - let fn_ab_r1 = match fi.sequence.bit_depth { - 8 => sgrproj_box_ab_r1::<8>, - 10 => sgrproj_box_ab_r1::<10>, - 12 => sgrproj_box_ab_r1::<12>, - _ => unimplemented!(), - }; - let fn_ab_r2 = match fi.sequence.bit_depth { - 8 => sgrproj_box_ab_r2::<8>, - 10 => sgrproj_box_ab_r2::<10>, - 12 => sgrproj_box_ab_r2::<12>, - _ => unimplemented!(), - }; - /* prime the intermediate arrays */ // One oddness about the radius=2 intermediate array computations that // the spec doesn't make clear: Although the spec defines computation @@ -886,7 +859,7 @@ pub fn sgrproj_solve( let integral_image = &integral_image_buffer.integral_image; let sq_integral_image = &integral_image_buffer.sq_integral_image; if s_r2 > 0 { - fn_ab_r2( + sgrproj_box_ab_r2::( &mut a_r2[0], &mut b_r2[0], integral_image, @@ -900,7 +873,7 @@ pub fn sgrproj_solve( } if s_r1 > 0 { let integral_image_offset = SOLVE_IMAGE_STRIDE + 1; - fn_ab_r1( + sgrproj_box_ab_r1::( &mut a_r1[0], &mut b_r1[0], &integral_image[integral_image_offset..], @@ -911,7 +884,7 @@ pub fn sgrproj_solve( s_r1, fi.cpu_feature_level, ); - fn_ab_r1( + sgrproj_box_ab_r1::( &mut a_r1[1], &mut b_r1[1], &integral_image[integral_image_offset..], @@ -930,7 +903,7 @@ pub fn sgrproj_solve( for y in (0..cdef_h).step_by(2) { // get results to use y and y+1 let f_r2_01: [&[u32]; 2] = if s_r2 > 0 { - fn_ab_r2( + sgrproj_box_ab_r2::( &mut a_r2[(y / 2 + 1) % 2], &mut b_r2[(y / 2 + 1) % 2], integral_image, @@ -963,7 +936,7 @@ pub fn sgrproj_solve( let y = y + dy; if s_r1 > 0 { let integral_image_offset = SOLVE_IMAGE_STRIDE + 1; - fn_ab_r1( + sgrproj_box_ab_r1::( &mut a_r1[(y + 2) % 3], &mut b_r1[(y + 2) % 3], &integral_image[integral_image_offset..], @@ -1093,16 +1066,15 @@ pub fn sgrproj_solve( } } -fn wiener_stripe_filter( - coeffs: [[i8; 3]; 2], fi: &FrameInvariants, crop_w: usize, crop_h: usize, - stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize, - cdeffed: &Plane, deblocked: &Plane, out: &mut Plane, +fn wiener_stripe_filter( + coeffs: [[i8; 3]; 2], crop_w: usize, crop_h: usize, stripe_w: usize, + stripe_h: usize, stripe_x: usize, stripe_y: isize, cdeffed: &Plane, + deblocked: &Plane, out: &mut Plane, ) { - let bit_depth = fi.sequence.bit_depth; - let round_h = if bit_depth == 12 { 5 } else { 3 }; - let round_v = if bit_depth == 12 { 9 } else { 11 }; - let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1); - let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1; + let round_h = if BD == 12 { 5 } else { 3 }; + let round_v = if BD == 12 { 9 } else { 11 }; + let offset = 1 << (BD + WIENER_BITS - round_h - 1); + let limit = (1 << (BD + 1 + WIENER_BITS - round_h)) - 1; let mut coeffs_ = [[0; 3]; 2]; for i in 0..2 { @@ -1197,7 +1169,7 @@ fn wiener_stripe_filter( *dst = T::cast_from(clamp( (acc + (1 << round_v >> 1)) >> round_v, 0, - (1 << bit_depth) - 1, + (1 << BD) - 1, )); } } @@ -1482,7 +1454,7 @@ impl RestorationState { } #[hawktracer(lrf_filter_frame)] - pub fn lrf_filter_frame( + pub fn lrf_filter_frame( &mut self, out: &mut Frame, pre_cdef: &Frame, fi: &FrameInvariants, ) { @@ -1530,9 +1502,8 @@ impl RestorationState { let ru = rp.restoration_unit_by_stripe(si, rux); match ru.filter { RestorationFilter::Wiener { coeffs } => { - wiener_stripe_filter( + wiener_stripe_filter::<_, BD>( coeffs, - fi, crop_w, crop_h, size, @@ -1562,7 +1533,7 @@ impl RestorationState { .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), ); - sgrproj_stripe_filter( + sgrproj_stripe_filter::<_, _, BD>( set, xqd, fi, diff --git a/src/mc.rs b/src/mc.rs index d9edde259b..45981cc6c9 100644 --- a/src/mc.rs +++ b/src/mc.rs @@ -247,10 +247,10 @@ pub(crate) mod rust { } #[cold_for_target_arch("x86_64")] - pub fn put_8tap( + pub fn put_8tap( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, - mode_y: FilterMode, bit_depth: usize, _cpu: CpuFeatureLevel, + mode_y: FilterMode, _cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); @@ -259,8 +259,8 @@ pub(crate) mod rust { let ref_stride = src.plane.cfg.stride; let y_filter = get_filter(mode_y, row_frac, height); let x_filter = get_filter(mode_x, col_frac, width); - let max_sample_val = (1 << bit_depth) - 1; - let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; + let max_sample_val = (1 << BD) - 1; + let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 }; match (col_frac, row_frac) { (0, 0) => { for r in 0..height { @@ -357,10 +357,10 @@ pub(crate) mod rust { const PREP_BIAS: i32 = 8192; #[cold_for_target_arch("x86_64")] - pub fn prep_8tap( + pub fn prep_8tap( tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode, - bit_depth: usize, _cpu: CpuFeatureLevel, + _cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); @@ -369,8 +369,8 @@ pub(crate) mod rust { let ref_stride = src.plane.cfg.stride; let y_filter = get_filter(mode_y, row_frac, height); let x_filter = get_filter(mode_x, col_frac, width); - let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; - let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS }; + let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 }; + let prep_bias = if BD == 8 { 0 } else { PREP_BIAS }; match (col_frac, row_frac) { (0, 0) => { for r in 0..height { @@ -451,17 +451,17 @@ pub(crate) mod rust { } #[cold_for_target_arch("x86_64")] - pub fn mc_avg( + pub fn mc_avg( dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize, - height: usize, bit_depth: usize, _cpu: CpuFeatureLevel, + height: usize, _cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); - let max_sample_val = (1 << bit_depth) - 1; - let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; - let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS * 2 }; + let max_sample_val = (1 << BD) - 1; + let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 }; + let prep_bias = if BD == 8 { 0 } else { PREP_BIAS * 2 }; for r in 0..height { let dst_slice = &mut dst[r]; for c in 0..width { diff --git a/src/me.rs b/src/me.rs index a6b09e9f03..ecbcdfa398 100644 --- a/src/me.rs +++ b/src/me.rs @@ -154,7 +154,7 @@ pub enum MVSamplingMode { } #[hawktracer(estimate_tile_motion)] -pub fn estimate_tile_motion( +pub fn estimate_tile_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, inter_cfg: &InterConfig, ) { @@ -194,7 +194,7 @@ pub fn estimate_tile_motion( .block_offset(0, 0); if new_subsampling { - refine_subsampled_sb_motion( + refine_subsampled_sb_motion::<_, BD>( fi, ts, ref_frame, @@ -205,7 +205,7 @@ pub fn estimate_tile_motion( ); } - estimate_sb_motion( + estimate_sb_motion::<_, BD>( fi, ts, ref_frame, @@ -221,7 +221,7 @@ pub fn estimate_tile_motion( } } -fn estimate_sb_motion( +fn estimate_sb_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, ref_frame: RefType, mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, init: bool, ssdec: u8, lambda: u32, @@ -257,7 +257,7 @@ fn estimate_sb_motion( // Run motion estimation. // Note that the initial search (init) instructs the called function to // perform a more extensive search. - if let Some(results) = estimate_motion( + if let Some(results) = estimate_motion::<_, BD>( fi, ts, w, @@ -285,7 +285,7 @@ fn estimate_sb_motion( } } -fn refine_subsampled_sb_motion( +fn refine_subsampled_sb_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, ref_frame: RefType, mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, ssdec: u8, lambda: u32, ) { @@ -307,7 +307,7 @@ fn refine_subsampled_sb_motion( let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec; // Refine the existing motion estimate - if let Some(results) = refine_subsampled_motion_estimate( + if let Some(results) = refine_subsampled_motion_estimate::<_, BD>( fi, ts, w, h, sub_bo, ref_frame, ssdec, lambda, ) { // normalize sad to 128x128 block @@ -536,7 +536,7 @@ fn get_subset_predictors( MotionEstimationSubsets { min_sad, median, subset_b, subset_c } } -pub fn estimate_motion( +pub fn estimate_motion( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, w: usize, h: usize, tile_bo: TileBlockOffset, ref_frame: RefType, pmv: Option<[MotionVector; 2]>, corner: MVSamplingMode, @@ -575,7 +575,7 @@ pub fn estimate_motion( _ => unimplemented!(), }; - let mut best: MotionSearchResult = full_pixel_me( + let mut best: MotionSearchResult = full_pixel_me::<_, BD>( fi, ts, org_region, @@ -599,26 +599,13 @@ pub fn estimate_motion( if let Some(pmv) = pmv { let use_satd: bool = fi.config.speed_settings.motion.use_satd_subpel; if use_satd { - best.rd = get_fullpel_mv_rd( - fi, - po, - org_region, - p_ref, - fi.sequence.bit_depth, - pmv, - lambda, - use_satd, - mvx_min, - mvx_max, - mvy_min, - mvy_max, - w, - h, - best.mv, + best.rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, use_satd, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, best.mv, ); } - sub_pixel_me( + sub_pixel_me::<_, BD>( fi, po, org_region, p_ref, lambda, pmv, mvx_min, mvx_max, mvy_min, mvy_max, w, h, use_satd, &mut best, ref_frame, ); @@ -634,7 +621,7 @@ pub fn estimate_motion( } /// Refine motion estimation that was computed one level of subsampling up. -fn refine_subsampled_motion_estimate( +fn refine_subsampled_motion_estimate( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, w: usize, h: usize, tile_bo: TileBlockOffset, ref_frame: RefType, ssdec: u8, lambda: u32, ) -> Option { @@ -679,7 +666,7 @@ fn refine_subsampled_motion_estimate( let x_hi = po.x + (mv.col as isize / 8 + 2).min(mvx_max / 8); let y_lo = po.y + (mv.row as isize / 8 - 1).max(mvy_min / 8); let y_hi = po.y + (mv.row as isize / 8 + 2).min(mvy_max / 8); - let mut results = full_search( + let mut results = full_search::<_, BD>( fi, x_lo, x_hi, y_lo, y_hi, w, h, org_region, p_ref, po, 1, lambda, pmv, ); @@ -692,7 +679,7 @@ fn refine_subsampled_motion_estimate( } } -fn full_pixel_me( +fn full_pixel_me( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, org_region: &PlaneRegion, p_ref: &Plane, tile_bo: TileBlockOffset, po: PlaneOffset, lambda: u32, pmv: [MotionVector; 2], w: usize, h: usize, @@ -722,29 +709,16 @@ fn full_pixel_me( let try_cands = |predictors: &[MotionVector], best: &mut MotionSearchResult| { - let mut results = get_best_predictor( - fi, - po, - org_region, - p_ref, - predictors, - fi.sequence.bit_depth, - pmv, - lambda, - mvx_min, - mvx_max, - mvy_min, - mvy_max, - w, - h, + let mut results = get_best_predictor::<_, BD>( + fi, po, org_region, p_ref, predictors, pmv, lambda, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, ); - fullpel_diamond_search( + fullpel_diamond_search::<_, BD>( fi, po, org_region, p_ref, &mut results, - fi.sequence.bit_depth, pmv, lambda, mvx_min, @@ -770,8 +744,8 @@ fn full_pixel_me( // from the previous frame. Stop once a candidate with a sad less than a // threshold is found. - let thresh = (subsets.min_sad as f32 * 1.2) as u32 - + (((w * h) as u32) << (fi.sequence.bit_depth - 8)); + let thresh = + (subsets.min_sad as f32 * 1.2) as u32 + (((w * h) as u32) << (BD - 8)); if let Some(median) = subsets.median { try_cands(&[median], &mut best); @@ -795,21 +769,9 @@ fn full_pixel_me( // Preform UMH search, either as the last possible search when full search // is disabled, or as the last search before resorting to full search. - uneven_multi_hex_search( - fi, - po, - org_region, - p_ref, - &mut best, - fi.sequence.bit_depth, - pmv, - lambda, - mvx_min, - mvx_max, - mvy_min, - mvy_max, - w, - h, + uneven_multi_hex_search::<_, BD>( + fi, po, org_region, p_ref, &mut best, pmv, lambda, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, // Use 24, since it is the largest range that x264 uses. 24, ); @@ -829,7 +791,7 @@ fn full_pixel_me( let y_lo = po.y + (-range_y).max(mvy_min / 8); let y_hi = po.y + (range_y).min(mvy_max / 8); - let results = full_search( + let results = full_search::<_, BD>( fi, x_lo, x_hi, @@ -857,44 +819,30 @@ fn full_pixel_me( } } -fn sub_pixel_me( +fn sub_pixel_me( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, lambda: u32, pmv: [MotionVector; 2], mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, use_satd: bool, best: &mut MotionSearchResult, ref_frame: RefType, ) { - subpel_diamond_search( - fi, - po, - org_region, - p_ref, - fi.sequence.bit_depth, - pmv, - lambda, - mvx_min, - mvx_max, - mvy_min, - mvy_max, - w, - h, - use_satd, - best, - ref_frame, + subpel_diamond_search::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, mvx_min, mvx_max, mvy_min, + mvy_max, w, h, use_satd, best, ref_frame, ); } -fn get_best_predictor( +fn get_best_predictor( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, - p_ref: &Plane, predictors: &[MotionVector], bit_depth: usize, - pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, - mvy_min: isize, mvy_max: isize, w: usize, h: usize, + p_ref: &Plane, predictors: &[MotionVector], pmv: [MotionVector; 2], + lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, + w: usize, h: usize, ) -> MotionSearchResult { let mut best: MotionSearchResult = MotionSearchResult::empty(); for &init_mv in predictors.iter() { - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, init_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, init_mv, ); if rd.cost < best.rd.cost { @@ -953,11 +901,11 @@ const DIAMOND_R1_PATTERN: [MotionVector; 4] = search_pattern!( /// For each step size, candidate motion vectors are examined for improvement /// to the current search location. The search location is moved to the best /// candidate (if any). This is repeated until the search location stops moving. -fn fullpel_diamond_search( +fn fullpel_diamond_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, - p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, - pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, - mvy_min: isize, mvy_max: isize, w: usize, h: usize, + p_ref: &Plane, current: &mut MotionSearchResult, pmv: [MotionVector; 2], + lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, + w: usize, h: usize, ) { // Define the initial and the final scale (log2) of the diamond. let (mut diamond_radius_log2, diamond_radius_end_log2) = (1u8, 0u8); @@ -967,9 +915,9 @@ fn fullpel_diamond_search( let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); for &offset in &DIAMOND_R1_PATTERN { let cand_mv = current.mv + (offset << diamond_radius_log2); - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, cand_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < best_cand.rd.cost { @@ -1052,11 +1000,11 @@ const SQUARE_REFINE_PATTERN: [MotionVector; 8] = search_pattern!( /// /// `current` provides the initial search location and serves as /// the output for the final search results. -fn hexagon_search( +fn hexagon_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, - p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, - pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, - mvy_min: isize, mvy_max: isize, w: usize, h: usize, + p_ref: &Plane, current: &mut MotionSearchResult, pmv: [MotionVector; 2], + lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, + w: usize, h: usize, ) { // The first iteration of hexagon search is implemented separate from // subsequent iterations, which overlap with previous iterations. @@ -1070,9 +1018,9 @@ fn hexagon_search( // First iteration of hexagon search. There are six candidates to consider. for i in 0..6 { let cand_mv = current.mv + HEXAGON_PATTERN[i]; - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, cand_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < best_cand.rd.cost { @@ -1102,9 +1050,9 @@ fn hexagon_search( let i = (center_cand_idx + idx_offset_mod6) % 6; let cand_mv = current.mv + HEXAGON_PATTERN[i]; - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, cand_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < best_cand.rd.cost { @@ -1119,9 +1067,9 @@ fn hexagon_search( let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); for &offset in &SQUARE_REFINE_PATTERN { let cand_mv = current.mv + offset; - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, cand_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < best_cand.rd.cost { @@ -1166,11 +1114,11 @@ const UMH_PATTERN: [MotionVector; 16] = search_pattern!( /// the output for the final search results. /// /// `me_range` parameter determines how far these stages can search. -fn uneven_multi_hex_search( +fn uneven_multi_hex_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, - p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, - pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, - mvy_min: isize, mvy_max: isize, w: usize, h: usize, me_range: i16, + p_ref: &Plane, current: &mut MotionSearchResult, pmv: [MotionVector; 2], + lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, + w: usize, h: usize, me_range: i16, ) { assert!(!current.is_empty()); @@ -1199,9 +1147,9 @@ fn uneven_multi_hex_search( for &offset in &HORIZONTAL_LINE { let cand_mv = center + offset * i; - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, cand_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < current.rd.cost { @@ -1220,9 +1168,9 @@ fn uneven_multi_hex_search( for &offset in &VERTICAL_LINE { let cand_mv = center + offset * i; - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, cand_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < current.rd.cost { @@ -1240,9 +1188,9 @@ fn uneven_multi_hex_search( continue; } let cand_mv = center + MotionVector { row, col }; - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, cand_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < current.rd.cost { @@ -1282,9 +1230,9 @@ fn uneven_multi_hex_search( for i in 1..=iterations { for &offset in &UMH_PATTERN { let cand_mv = center + offset * i; - let rd = get_fullpel_mv_rd( - fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, cand_mv, + let rd = get_fullpel_mv_rd::<_, BD>( + fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < current.rd.cost { @@ -1295,9 +1243,9 @@ fn uneven_multi_hex_search( } // Refine the search results using a 'normal' hexagon search. - hexagon_search( - fi, po, org_region, p_ref, current, bit_depth, pmv, lambda, mvx_min, - mvx_max, mvy_min, mvy_max, w, h, + hexagon_search::<_, BD>( + fi, po, org_region, p_ref, current, pmv, lambda, mvx_min, mvx_max, + mvy_min, mvy_max, w, h, ); } @@ -1306,12 +1254,11 @@ fn uneven_multi_hex_search( /// For each step size, candidate motion vectors are examined for improvement /// to the current search location. The search location is moved to the best /// candidate (if any). This is repeated until the search location stops moving. -fn subpel_diamond_search( +fn subpel_diamond_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, - _p_ref: &Plane, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, - mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, - h: usize, use_satd: bool, current: &mut MotionSearchResult, - ref_frame: RefType, + _p_ref: &Plane, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, + mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, + use_satd: bool, current: &mut MotionSearchResult, ref_frame: RefType, ) { use crate::util::Aligned; @@ -1340,11 +1287,10 @@ fn subpel_diamond_search( for &offset in &DIAMOND_R1_PATTERN_SUBPEL { let cand_mv = current.mv + (offset << diamond_radius_log2); - let rd = get_subpel_mv_rd( + let rd = get_subpel_mv_rd::<_, BD>( fi, po, org_region, - bit_depth, pmv, lambda, use_satd, @@ -1381,11 +1327,11 @@ fn subpel_diamond_search( } #[inline] -fn get_fullpel_mv_rd( +fn get_fullpel_mv_rd( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, - p_ref: &Plane, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, - use_satd: bool, mvx_min: isize, mvx_max: isize, mvy_min: isize, - mvy_max: isize, w: usize, h: usize, cand_mv: MotionVector, + p_ref: &Plane, pmv: [MotionVector; 2], lambda: u32, use_satd: bool, + mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, + h: usize, cand_mv: MotionVector, ) -> MVCandidateRD { if (cand_mv.col as isize) < mvx_min || (cand_mv.col as isize) > mvx_max @@ -1400,17 +1346,16 @@ fn get_fullpel_mv_rd( x: po.x + (cand_mv.col / 8) as isize, y: po.y + (cand_mv.row / 8) as isize, }); - compute_mv_rd( - fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region, - &plane_ref, + compute_mv_rd::<_, BD>( + fi, pmv, lambda, use_satd, w, h, cand_mv, org_region, &plane_ref, ) } -fn get_subpel_mv_rd( +fn get_subpel_mv_rd( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, - bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, use_satd: bool, - mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, - h: usize, cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut, + pmv: [MotionVector; 2], lambda: u32, use_satd: bool, mvx_min: isize, + mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, + cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut, ref_frame: RefType, ) -> MVCandidateRD { if (cand_mv.col as isize) < mvx_min @@ -1426,29 +1371,28 @@ fn get_subpel_mv_rd( let tile_rect = TileRect { x: 0, y: 0, width: tmp_width, height: tmp_height }; - PredictionMode::NEWMV.predict_inter_single( + PredictionMode::NEWMV.predict_inter_single::<_, BD>( fi, tile_rect, 0, po, tmp_region, // motion comp's w & h on edges can be different than distortion's tmp_width, tmp_height, ref_frame, cand_mv, ); let plane_ref = tmp_region.as_const(); - compute_mv_rd( - fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region, - &plane_ref, + compute_mv_rd::<_, BD>( + fi, pmv, lambda, use_satd, w, h, cand_mv, org_region, &plane_ref, ) } /// Compute the rate distortion stats for a motion vector. #[inline(always)] -fn compute_mv_rd( +fn compute_mv_rd( fi: &FrameInvariants, pmv: [MotionVector; 2], lambda: u32, - use_satd: bool, bit_depth: usize, w: usize, h: usize, cand_mv: MotionVector, + use_satd: bool, w: usize, h: usize, cand_mv: MotionVector, plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, ) -> MVCandidateRD { let sad = if use_satd { - get_satd(plane_org, plane_ref, w, h, bit_depth, fi.cpu_feature_level) + get_satd::<_, BD>(plane_org, plane_ref, w, h, fi.cpu_feature_level) } else { - get_sad(plane_org, plane_ref, w, h, bit_depth, fi.cpu_feature_level) + get_sad(plane_org, plane_ref, w, h, fi.cpu_feature_level) }; let rate1 = get_mv_rate(cand_mv, pmv[0], fi.allow_high_precision_mv); @@ -1458,7 +1402,7 @@ fn compute_mv_rd( MVCandidateRD { cost: 256 * sad as u64 + rate as u64 * lambda as u64, sad } } -fn full_search( +fn full_search( fi: &FrameInvariants, x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize, w: usize, h: usize, org_region: &PlaneRegion, p_ref: &Plane, po: PlaneOffset, step: usize, lambda: u32, pmv: [MotionVector; 2], @@ -1482,12 +1426,11 @@ fn full_search( col: 8 * (x as i16 - po.x as i16), }; - let rd = compute_mv_rd( + let rd = compute_mv_rd::<_, BD>( fi, pmv, lambda, false, - fi.sequence.bit_depth, w, h, mv, diff --git a/src/partition.rs b/src/partition.rs index e64de6e9ad..22b800b646 100644 --- a/src/partition.rs +++ b/src/partition.rs @@ -591,7 +591,7 @@ fn supersample_chroma_bsize( } } -pub fn get_intra_edges( +pub fn get_intra_edges( dst: &PlaneRegion<'_, T>, partition_bo: TileBlockOffset, // partition bo, BlockOffset bx: usize, @@ -599,7 +599,6 @@ pub fn get_intra_edges( partition_size: BlockSize, // partition size, BlockSize po: PlaneOffset, tx_size: TxSize, - bit_depth: usize, opt_mode: Option, enable_intra_edge_filter: bool, intra_param: IntraParam, @@ -610,7 +609,7 @@ pub fn get_intra_edges( let mut edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> = unsafe { Aligned::uninitialized() }; //Aligned::new([T::cast_from(0); 4 * MAX_TX_SIZE + 1]); - let base = 128u16 << (bit_depth - 8); + let base = 128u16 << (BD - 8); { // left pixels are ordered from bottom to top and right-aligned diff --git a/src/predict.rs b/src/predict.rs index 632196c72a..0c3098233d 100644 --- a/src/predict.rs +++ b/src/predict.rs @@ -205,9 +205,9 @@ impl PredictionMode { /// # Panics /// /// - If called on an inter `PredictionMode` - pub fn predict_intra( + pub fn predict_intra( self, tile_rect: TileRect, dst: &mut PlaneRegionMut<'_, T>, - tx_size: TxSize, bit_depth: usize, ac: &[i16], intra_param: IntraParam, + tx_size: TxSize, ac: &[i16], intra_param: IntraParam, ief_params: Option, edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, cpu: CpuFeatureLevel, ) { @@ -245,9 +245,8 @@ impl PredictionMode { _ => intra_mode_to_angle(mode) + (angle_delta * ANGLE_STEP) as isize, }; - dispatch_predict_intra::( - mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf, - cpu, + dispatch_predict_intra::( + mode, variant, dst, tx_size, ac, angle, ief_params, edge_buf, cpu, ); } @@ -304,7 +303,7 @@ impl PredictionMode { /// # Panics /// /// - If called on an intra `PredictionMode` - pub fn predict_inter_single( + pub fn predict_inter_single( self, fi: &FrameInvariants, tile_rect: TileRect, p: usize, po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize, height: usize, ref_frame: RefType, mv: MotionVector, @@ -319,7 +318,7 @@ impl PredictionMode { { let (row_frac, col_frac, src) = PredictionMode::get_mv_params(&rec.frame.planes[p], frame_po, mv); - put_8tap( + put_8tap::<_, BD>( dst, src, width, @@ -328,7 +327,6 @@ impl PredictionMode { row_frac, mode, mode, - fi.sequence.bit_depth, fi.cpu_feature_level, ); } @@ -339,7 +337,7 @@ impl PredictionMode { /// # Panics /// /// - If called on an intra `PredictionMode` - pub fn predict_inter_compound( + pub fn predict_inter_compound( self, fi: &FrameInvariants, tile_rect: TileRect, p: usize, po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize, height: usize, ref_frames: [RefType; 2], mvs: [MotionVector; 2], @@ -359,7 +357,7 @@ impl PredictionMode { frame_po, mvs[i], ); - prep_8tap( + prep_8tap::<_, BD>( buffer.get_buffer_mut(i), src, width, @@ -368,25 +366,23 @@ impl PredictionMode { row_frac, mode, mode, - fi.sequence.bit_depth, fi.cpu_feature_level, ); } } - mc_avg( + mc_avg::<_, BD>( dst, buffer.get_buffer(0), buffer.get_buffer(1), width, height, - fi.sequence.bit_depth, fi.cpu_feature_level, ); } /// Inter prediction that determines whether compound mode is being used based /// on the second [`RefType`] in [`ref_frames`]. - pub fn predict_inter( + pub fn predict_inter( self, fi: &FrameInvariants, tile_rect: TileRect, p: usize, po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize, height: usize, ref_frames: [RefType; 2], mvs: [MotionVector; 2], @@ -396,7 +392,7 @@ impl PredictionMode { && ref_frames[1] != RefType::NONE_FRAME; if !is_compound { - self.predict_inter_single( + self.predict_inter_single::<_, BD>( fi, tile_rect, p, @@ -408,7 +404,7 @@ impl PredictionMode { mvs[0], ) } else { - self.predict_inter_compound( + self.predict_inter_compound::<_, BD>( fi, tile_rect, p, @@ -698,10 +694,10 @@ pub(crate) mod rust { use std::mem::size_of; #[inline(always)] - pub fn dispatch_predict_intra( + pub fn dispatch_predict_intra( mode: PredictionMode, variant: PredictionVariant, - dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, - ac: &[i16], angle: isize, ief_params: Option, + dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, ac: &[i16], + angle: isize, ief_params: Option, edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, _cpu: CpuFeatureLevel, ) { let width = tx_size.width(); @@ -718,11 +714,11 @@ pub(crate) mod rust { match mode { PredictionMode::DC_PRED => { (match variant { - PredictionVariant::NONE => pred_dc_128, + PredictionVariant::NONE => pred_dc_128::<_, BD>, PredictionVariant::LEFT => pred_dc_left, PredictionVariant::TOP => pred_dc_top, PredictionVariant::BOTH => pred_dc, - })(dst, above_slice, left_slice, width, height, bit_depth) + })(dst, above_slice, left_slice, width, height) } PredictionMode::V_PRED if angle == 90 => { pred_v(dst, above_slice, width, height) @@ -737,7 +733,7 @@ pub(crate) mod rust { | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED - | PredictionMode::D67_PRED => pred_directional( + | PredictionMode::D67_PRED => pred_directional::<_, BD>( dst, above_slice, left_and_left_below_slice, @@ -745,7 +741,6 @@ pub(crate) mod rust { angle as usize, width, height, - bit_depth, ief_params, ), PredictionMode::SMOOTH_PRED => { @@ -760,28 +755,23 @@ pub(crate) mod rust { PredictionMode::PAETH_PRED => { pred_paeth(dst, above_slice, left_slice, top_left[0], width, height) } - PredictionMode::UV_CFL_PRED => (match variant { - PredictionVariant::NONE => pred_cfl_128, - PredictionVariant::LEFT => pred_cfl_left, - PredictionVariant::TOP => pred_cfl_top, - PredictionVariant::BOTH => pred_cfl, - })( - dst, - ac, - angle as i16, - above_slice, - left_slice, - width, - height, - bit_depth, - ), + PredictionMode::UV_CFL_PRED => { + (match variant { + PredictionVariant::NONE => pred_cfl_128::<_, BD>, + PredictionVariant::LEFT => pred_cfl_left::<_, BD>, + PredictionVariant::TOP => pred_cfl_top::<_, BD>, + PredictionVariant::BOTH => pred_cfl::<_, BD>, + })( + dst, ac, angle as i16, above_slice, left_slice, width, height + ) + } _ => unimplemented!(), } } pub(crate) fn pred_dc( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], width: usize, - height: usize, _bit_depth: usize, + height: usize, ) { let edges = left[..height].iter().chain(above[..width].iter()); let len = (width + height) as u32; @@ -797,11 +787,11 @@ pub(crate) mod rust { } } - pub(crate) fn pred_dc_128( + pub(crate) fn pred_dc_128( output: &mut PlaneRegionMut<'_, T>, _above: &[T], _left: &[T], - width: usize, height: usize, bit_depth: usize, + width: usize, height: usize, ) { - let v = T::cast_from(128u32 << (bit_depth - 8)); + let v = T::cast_from(128u32 << (BD - 8)); for line in output.rows_iter_mut().take(height) { line[..width].fill(v); } @@ -809,7 +799,7 @@ pub(crate) mod rust { pub(crate) fn pred_dc_left( output: &mut PlaneRegionMut<'_, T>, _above: &[T], left: &[T], - width: usize, height: usize, _bit_depth: usize, + width: usize, height: usize, ) { let sum = left[..].iter().fold(0u32, |acc, &v| { let v: u32 = v.into(); @@ -823,7 +813,7 @@ pub(crate) mod rust { pub(crate) fn pred_dc_top( output: &mut PlaneRegionMut<'_, T>, above: &[T], _left: &[T], - width: usize, height: usize, _bit_depth: usize, + width: usize, height: usize, ) { let sum = above[..width].iter().fold(0u32, |acc, &v| { let v: u32 = v.into(); @@ -1051,9 +1041,9 @@ pub(crate) mod rust { } } - pub(crate) fn pred_cfl_inner( + pub(crate) fn pred_cfl_inner( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, width: usize, - height: usize, bit_depth: usize, + height: usize, ) { if alpha == 0 { return; @@ -1063,7 +1053,7 @@ pub(crate) mod rust { assert!(output.plane_cfg.stride >= width); assert!(output.rows_iter().len() >= height); - let sample_max = (1 << bit_depth) - 1; + let sample_max = (1 << BD) - 1; let avg: i32 = output[0][0].into(); for (line, luma) in @@ -1077,43 +1067,43 @@ pub(crate) mod rust { } } - pub(crate) fn pred_cfl( + pub(crate) fn pred_cfl( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T], - left: &[T], width: usize, height: usize, bit_depth: usize, + left: &[T], width: usize, height: usize, ) { - pred_dc(output, above, left, width, height, bit_depth); - pred_cfl_inner(output, ac, alpha, width, height, bit_depth); + pred_dc(output, above, left, width, height); + pred_cfl_inner::<_, BD>(output, ac, alpha, width, height); } - pub(crate) fn pred_cfl_128( + pub(crate) fn pred_cfl_128( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T], - left: &[T], width: usize, height: usize, bit_depth: usize, + left: &[T], width: usize, height: usize, ) { - pred_dc_128(output, above, left, width, height, bit_depth); - pred_cfl_inner(output, ac, alpha, width, height, bit_depth); + pred_dc_128::<_, BD>(output, above, left, width, height); + pred_cfl_inner::<_, BD>(output, ac, alpha, width, height); } - pub(crate) fn pred_cfl_left( + pub(crate) fn pred_cfl_left( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T], - left: &[T], width: usize, height: usize, bit_depth: usize, + left: &[T], width: usize, height: usize, ) { - pred_dc_left(output, above, left, width, height, bit_depth); - pred_cfl_inner(output, ac, alpha, width, height, bit_depth); + pred_dc_left(output, above, left, width, height); + pred_cfl_inner::<_, BD>(output, ac, alpha, width, height); } - pub(crate) fn pred_cfl_top( + pub(crate) fn pred_cfl_top( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T], - left: &[T], width: usize, height: usize, bit_depth: usize, + left: &[T], width: usize, height: usize, ) { - pred_dc_top(output, above, left, width, height, bit_depth); - pred_cfl_inner(output, ac, alpha, width, height, bit_depth); + pred_dc_top(output, above, left, width, height); + pred_cfl_inner::<_, BD>(output, ac, alpha, width, height); } #[allow(clippy::clone_double_ref)] - pub(crate) fn pred_directional( + pub(crate) fn pred_directional( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], top_left: &[T], p_angle: usize, width: usize, height: usize, - bit_depth: usize, ief_params: Option, + ief_params: Option, ) { #[allow(clippy::collapsible_if)] #[allow(clippy::collapsible_else_if)] @@ -1223,7 +1213,7 @@ pub(crate) mod rust { edge.copy_from_slice(edge_filtered.as_slice()); } - fn upsample_edge(size: usize, edge: &mut [T], bit_depth: usize) { + fn upsample_edge(size: usize, edge: &mut [T]) { // The input edge should be valid in the -1..size range, // where the -1 index is the top-left edge pixel. Since // negative indices are unsafe in Rust, the caller is @@ -1247,14 +1237,14 @@ pub(crate) mod rust { + (9 * dup[i + 1].to_i32().unwrap()) + (9 * dup[i + 2].to_i32().unwrap()) - dup[i + 3].to_i32().unwrap(); - s = ((s + 8) / 16).clamp(0, (1 << bit_depth) - 1); + s = ((s + 8) / 16).clamp(0, (1 << BD) - 1); edge[2 * i + 1] = T::cast_from(s); edge[2 * i + 2] = dup[i + 2]; } } - let sample_max = (1 << bit_depth) - 1; + let sample_max = (1 << BD) - 1; let max_x = output.plane_cfg.width as isize - 1; let max_y = output.plane_cfg.height as isize - 1; @@ -1332,7 +1322,7 @@ pub(crate) mod rust { p_angle as isize - 90, ); if upsample_above { - upsample_edge(num_px.0, above_filtered.as_mut_slice(), bit_depth); + upsample_edge::<_, BD>(num_px.0, &mut above_filtered[..]); } upsample_left = select_ief_upsample( width, @@ -1341,7 +1331,7 @@ pub(crate) mod rust { p_angle as isize - 180, ); if upsample_left { - upsample_edge(num_px.1, left_filtered.as_mut_slice(), bit_depth); + upsample_edge::<_, BD>(num_px.1, &mut left_filtered[..]); } left_filtered.reverse(); @@ -1509,16 +1499,16 @@ mod test { let mut output = Plane::from_slice(&[0u8; 4 * 4], 4); - pred_dc(&mut output.as_region_mut(), above, left, 4, 4, 8); + pred_dc(&mut output.as_region_mut(), above, left, 4, 4); assert_eq!(&output.data[..], [32u8; 16]); - pred_dc_top(&mut output.as_region_mut(), above, left, 4, 4, 8); + pred_dc_top(&mut output.as_region_mut(), above, left, 4, 4); assert_eq!(&output.data[..], [35u8; 16]); - pred_dc_left(&mut output.as_region_mut(), above, left, 4, 4, 8); + pred_dc_left(&mut output.as_region_mut(), above, left, 4, 4); assert_eq!(&output.data[..], [30u8; 16]); - pred_dc_128(&mut output.as_region_mut(), above, left, 4, 4, 8); + pred_dc_128::<_, 8>(&mut output.as_region_mut(), above, left, 4, 4); assert_eq!(&output.data[..], [128u8; 16]); pred_v(&mut output.as_region_mut(), above, 4, 4); @@ -1594,7 +1584,7 @@ mod test { [33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36], ]; for (&angle, expected) in angles.iter().zip(expected.iter()) { - pred_directional( + pred_directional::<_, 8>( &mut output.as_region_mut(), above, left, @@ -1602,7 +1592,6 @@ mod test { angle, 4, 4, - 8, None, ); assert_eq!(&output.data[..], expected); @@ -1617,7 +1606,7 @@ mod test { let mut o = Plane::from_slice(&vec![0u16; 32 * 32], 32); - pred_dc(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4, 16); + pred_dc(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4); for l in o.data.chunks(32).take(4) { for v in l[..4].iter() { diff --git a/src/quantize/mod.rs b/src/quantize/mod.rs index 72006361fb..2f6e2b103a 100644 --- a/src/quantize/mod.rs +++ b/src/quantize/mod.rs @@ -36,18 +36,24 @@ pub fn get_log_tx_scale(tx_size: TxSize) -> usize { + Into::::into(num_pixels > 1024) } -pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 { - let dc_q: [&[NonZeroU16; 256]; 3] = - [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3]; - let bd = ((bit_depth ^ 8) >> 1).min(2); - dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)] +pub fn dc_q(qindex: u8, delta_q: i8) -> NonZeroU16 { + let dc_q = match BD { + 8 => &dc_qlookup_Q3, + 10 => &dc_qlookup_10_Q3, + 12 => &dc_qlookup_12_Q3, + _ => unimplemented!(), + }; + dc_q[((qindex as isize + delta_q as isize).max(0) as usize).min(255)] } -pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 { - let ac_q: [&[NonZeroU16; 256]; 3] = - [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3]; - let bd = ((bit_depth ^ 8) >> 1).min(2); - ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)] +pub fn ac_q(qindex: u8, delta_q: i8) -> NonZeroU16 { + let ac_q = match BD { + 8 => &ac_qlookup_Q3, + 10 => &ac_qlookup_10_Q3, + 12 => &ac_qlookup_12_Q3, + _ => unimplemented!(), + }; + ac_q[((qindex as isize + delta_q as isize).max(0) as usize).min(255)] } // TODO: Handle lossless properly. @@ -78,8 +84,8 @@ fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 { } } -pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 { - let qlookup = match bit_depth { +pub fn select_dc_qi(quantizer: i64) -> u8 { + let qlookup = match BD { 8 => &dc_qlookup_Q3, 10 => &dc_qlookup_10_Q3, 12 => &dc_qlookup_12_Q3, @@ -88,8 +94,8 @@ pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 { select_qi(quantizer, qlookup) } -pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 { - let qlookup = match bit_depth { +pub fn select_ac_qi(quantizer: i64) -> u8 { + let qlookup = match BD { 8 => &ac_qlookup_Q3, 10 => &ac_qlookup_10_Q3, 12 => &ac_qlookup_12_Q3, @@ -218,16 +224,16 @@ mod test { } impl QuantizationContext { - pub fn update( - &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize, - dc_delta_q: i8, ac_delta_q: i8, + pub fn update( + &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, dc_delta_q: i8, + ac_delta_q: i8, ) { self.log_tx_scale = get_log_tx_scale(tx_size); - self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth); + self.dc_quant = dc_q::(qindex, dc_delta_q); self.dc_mul_add = divu_gen(self.dc_quant.into()); - self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth); + self.ac_quant = ac_q::(qindex, ac_delta_q); self.ac_mul_add = divu_gen(self.ac_quant.into()); // All of these biases were derived by measuring the cost of coding @@ -352,15 +358,15 @@ pub mod rust { use super::*; use crate::cpu_features::CpuFeatureLevel; - pub fn dequantize( + pub fn dequantize( qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [T], tx_size: TxSize, - bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel, + dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel, ) { let log_tx_scale = get_log_tx_scale(tx_size) as i32; let offset = (1 << log_tx_scale) - 1; - let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32; - let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32; + let dc_quant = dc_q::(qindex, dc_delta_q).get() as i32; + let ac_quant = ac_q::(qindex, ac_delta_q).get() as i32; for (i, (r, c)) in rcoeffs .iter_mut() diff --git a/src/rate.rs b/src/rate.rs index 3ae7b09e1b..da0d98d636 100644 --- a/src/rate.rs +++ b/src/rate.rs @@ -522,15 +522,14 @@ fn chroma_offset( } impl QuantizerParameters { - fn new_from_log_q( - log_base_q: i64, log_target_q: i64, bit_depth: usize, - chroma_sampling: ChromaSampling, is_intra: bool, - log_isqrt_mean_scale: i64, + fn new_from_log_q( + log_base_q: i64, log_target_q: i64, chroma_sampling: ChromaSampling, + is_intra: bool, log_isqrt_mean_scale: i64, ) -> QuantizerParameters { - let scale = log_isqrt_mean_scale + q57(QSCALE + bit_depth as i32 - 8); + let scale = log_isqrt_mean_scale + q57(QSCALE + BD as i32 - 8); let mut log_q_y = log_target_q; - if !is_intra && bit_depth == 8 { + if !is_intra && BD == 8 { log_q_y = log_target_q + (log_target_q >> 32) * Q_MODEL_MUL[chroma_sampling as usize] + Q_MODEL_ADD[chroma_sampling as usize]; @@ -552,7 +551,7 @@ impl QuantizerParameters { let scale = |q| bexp64((log_target_q - q) * 2 + q57(16)) as f64 / 65536.; let dist_scale = [scale(log_q_y), scale(log_q_u), scale(log_q_v)]; - let base_q_idx = select_ac_qi(quantizer, bit_depth).max(1); + let base_q_idx = select_ac_qi::(quantizer).max(1); // delta_q only gets 6 bits + a sign bit, so it can differ by 63 at most. let min_qi = base_q_idx.saturating_sub(63).max(1); @@ -564,14 +563,14 @@ impl QuantizerParameters { log_target_q, // TODO: Allow lossless mode; i.e. qi == 0. dc_qi: [ - clamp_qi(select_dc_qi(quantizer, bit_depth)), - if mono { 0 } else { clamp_qi(select_dc_qi(quantizer_u, bit_depth)) }, - if mono { 0 } else { clamp_qi(select_dc_qi(quantizer_v, bit_depth)) }, + clamp_qi(select_dc_qi::(quantizer)), + if mono { 0 } else { clamp_qi(select_dc_qi::(quantizer_u)) }, + if mono { 0 } else { clamp_qi(select_dc_qi::(quantizer_v)) }, ], ac_qi: [ base_q_idx, - if mono { 0 } else { clamp_qi(select_ac_qi(quantizer_u, bit_depth)) }, - if mono { 0 } else { clamp_qi(select_ac_qi(quantizer_v, bit_depth)) }, + if mono { 0 } else { clamp_qi(select_ac_qi::(quantizer_u)) }, + if mono { 0 } else { clamp_qi(select_ac_qi::(quantizer_v)) }, ], lambda, dist_scale, @@ -701,17 +700,16 @@ impl RCState { } } - pub(crate) fn select_first_pass_qi( - &self, bit_depth: usize, fti: usize, chroma_sampling: ChromaSampling, + pub(crate) fn select_first_pass_qi( + &self, fti: usize, chroma_sampling: ChromaSampling, ) -> QuantizerParameters { // Adjust the quantizer for the frame type, result is Q57: let log_q = ((self.pass1_log_base_q + (1i64 << 11)) >> 12) * (MQP_Q12[fti] as i64) + DQP_Q57[fti]; - QuantizerParameters::new_from_log_q( + QuantizerParameters::new_from_log_q::( self.pass1_log_base_q, log_q, - bit_depth, chroma_sampling, fti == 0, 0, @@ -719,7 +717,7 @@ impl RCState { } // TODO: Separate quantizers for Cb and Cr. - pub(crate) fn select_qi( + pub(crate) fn select_qi( &self, ctx: &ContextInner, output_frameno: u64, fti: usize, maybe_prev_log_base_q: Option, log_isqrt_mean_scale: i64, ) -> QuantizerParameters { @@ -727,14 +725,12 @@ impl RCState { if self.target_bitrate <= 0 { // Rate control is not active. // Derive quantizer directly from frame type. - let bit_depth = ctx.config.bit_depth; let chroma_sampling = ctx.config.chroma_sampling; let (log_base_q, log_q) = - Self::calc_flat_quantizer(ctx.config.quantizer as u8, bit_depth, fti); - QuantizerParameters::new_from_log_q( + Self::calc_flat_quantizer::(ctx.config.quantizer as u8, fti); + QuantizerParameters::new_from_log_q::( log_base_q, log_q, - bit_depth, chroma_sampling, fti == 0, log_isqrt_mean_scale, @@ -748,11 +744,8 @@ impl RCState { match self.twopass_state { // First pass of 2-pass mode: use a fixed base quantizer. PASS_1 => { - return self.select_first_pass_qi( - ctx.config.bit_depth, - fti, - ctx.config.chroma_sampling, - ); + return self + .select_first_pass_qi::(fti, ctx.config.chroma_sampling); } // Second pass of 2-pass mode: we know exactly how much of each frame // type there is in the current buffer window, and have estimates for @@ -906,17 +899,16 @@ impl RCState { // in the binary log domain (binary exp and log aren't too bad): // rate = exp2(log2(scale) - log2(quantizer)*exp) // There's no easy closed form solution, so we bisection searh for it. - let bit_depth = ctx.config.bit_depth; let chroma_sampling = ctx.config.chroma_sampling; // TODO: Proper handling of lossless. - let mut log_qlo = blog64(ac_q(self.ac_qi_min, 0, bit_depth).get() as i64) - - q57(QSCALE + bit_depth as i32 - 8); + let mut log_qlo = blog64(ac_q::(self.ac_qi_min, 0).get() as i64) + - q57(QSCALE + BD as i32 - 8); // The AC quantizer tables map to values larger than the DC quantizer // tables, so we use that as the upper bound to make sure we can use // the full table if needed. let mut log_qhi = blog64( - ac_q(self.maybe_ac_qi_max.unwrap_or(255), 0, bit_depth).get() as i64, - ) - q57(QSCALE + bit_depth as i32 - 8); + ac_q::(self.maybe_ac_qi_max.unwrap_or(255), 0).get() as i64, + ) - q57(QSCALE + BD as i32 - 8); let mut log_base_q = (log_qlo + log_qhi) >> 1; while log_qlo < log_qhi { // Count bits contributed by each frame type using the model. @@ -1020,20 +1012,19 @@ impl RCState { if let Some(qi_max) = self.maybe_ac_qi_max { let (max_log_base_q, max_log_q) = - Self::calc_flat_quantizer(qi_max, ctx.config.bit_depth, fti); + Self::calc_flat_quantizer::(qi_max, fti); log_base_q = cmp::min(log_base_q, max_log_base_q); log_q = cmp::min(log_q, max_log_q); } if self.ac_qi_min > 0 { let (min_log_base_q, min_log_q) = - Self::calc_flat_quantizer(self.ac_qi_min, ctx.config.bit_depth, fti); + Self::calc_flat_quantizer::(self.ac_qi_min, fti); log_base_q = cmp::max(log_base_q, min_log_base_q); log_q = cmp::max(log_q, min_log_q); } - QuantizerParameters::new_from_log_q( + QuantizerParameters::new_from_log_q::( log_base_q, log_q, - bit_depth, chroma_sampling, fti == 0, log_isqrt_mean_scale, @@ -1043,8 +1034,8 @@ impl RCState { // Computes a quantizer directly from the frame type and base quantizer index, // without consideration for rate control. - fn calc_flat_quantizer( - base_qi: u8, bit_depth: usize, fti: usize, + fn calc_flat_quantizer( + base_qi: u8, fti: usize, ) -> (i64, i64) { // TODO: Rename "quantizer" something that indicates it is a quantizer // index, and move it somewhere more sensible (or choose a better way to @@ -1052,13 +1043,13 @@ impl RCState { // We use the AC quantizer as the source quantizer since its quantizer // tables have unique entries, while the DC tables do not. - let ac_quantizer = ac_q(base_qi, 0, bit_depth).get() as i64; + let ac_quantizer = ac_q::(base_qi, 0).get() as i64; // Pick the nearest DC entry since an exact match may be unavailable. - let dc_qi = select_dc_qi(ac_quantizer, bit_depth); - let dc_quantizer = dc_q(dc_qi, 0, bit_depth).get() as i64; + let dc_qi = select_dc_qi::(ac_quantizer); + let dc_quantizer = dc_q::(dc_qi, 0).get() as i64; // Get the log quantizers as Q57. - let log_ac_q = blog64(ac_quantizer) - q57(QSCALE + bit_depth as i32 - 8); - let log_dc_q = blog64(dc_quantizer) - q57(QSCALE + bit_depth as i32 - 8); + let log_ac_q = blog64(ac_quantizer) - q57(QSCALE + BD as i32 - 8); + let log_dc_q = blog64(dc_quantizer) - q57(QSCALE + BD as i32 - 8); // Target the midpoint of the chosen entries. let log_base_q = (log_ac_q + log_dc_q + 1) >> 1; // Adjust the quantizer for the frame type, result is Q57: @@ -1255,11 +1246,13 @@ impl RCState { cur_pos } - pub(crate) fn select_pass1_log_base_q( + pub(crate) fn select_pass1_log_base_q( &self, ctx: &ContextInner, output_frameno: u64, ) -> i64 { assert_eq!(self.twopass_state, PASS_SINGLE); - self.select_qi(ctx, output_frameno, FRAME_SUBTYPE_I, None, 0).log_base_q + self + .select_qi::<_, BD>(ctx, output_frameno, FRAME_SUBTYPE_I, None, 0) + .log_base_q } // Initialize the first pass and emit a placeholder summary diff --git a/src/rdo.rs b/src/rdo.rs index c92b383b92..1dfdba438d 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -139,9 +139,13 @@ pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 { } #[allow(unused)] -pub fn cdef_dist_wxh DistortionScale>( +pub fn cdef_dist_wxh< + T: Pixel, + F: Fn(Area, BlockSize) -> DistortionScale, + const BD: usize, +>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, - bit_depth: usize, compute_bias: F, cpu: CpuFeatureLevel, + compute_bias: F, cpu: CpuFeatureLevel, ) -> Distortion { debug_assert!(src1.plane_cfg.xdec == 0); debug_assert!(src1.plane_cfg.ydec == 0); @@ -155,12 +159,11 @@ pub fn cdef_dist_wxh DistortionScale>( let kernel_w = (w - x).min(8); let area = Area::StartingAt { x: x as isize, y: y as isize }; - let value = RawDistortion(cdef_dist_kernel( + let value = RawDistortion(cdef_dist_kernel::<_, BD>( &src1.subregion(area), &src2.subregion(area), kernel_w, kernel_h, - bit_depth, cpu, ) as u64); @@ -174,9 +177,13 @@ pub fn cdef_dist_wxh DistortionScale>( /// Sum of Squared Error for a wxh block /// Currently limited to w and h of valid blocks -pub fn sse_wxh DistortionScale>( +pub fn sse_wxh< + T: Pixel, + F: Fn(Area, BlockSize) -> DistortionScale, + const BD: usize, +>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, - compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel, + compute_bias: F, cpu: CpuFeatureLevel, ) -> Distortion { // See get_weighted_sse in src/dist.rs. // Provide a scale to get_weighted_sse for each square region of this size. @@ -218,9 +225,7 @@ pub fn sse_wxh DistortionScale>( } } - Distortion(get_weighted_sse( - src1, src2, buf, buf_stride, w, h, bit_depth, cpu, - )) + Distortion(get_weighted_sse(src1, src2, buf, buf_stride, w, h, cpu)) } pub const fn clip_visible_bsize( @@ -249,7 +254,7 @@ pub const fn clip_visible_bsize( } // Compute the pixel-domain distortion for an encode -fn compute_distortion( +fn compute_distortion( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool, ) -> ScaledDistortion { @@ -272,12 +277,11 @@ fn compute_distortion( } let mut distortion = match fi.config.tune { - Tune::Psychovisual => cdef_dist_wxh( + Tune::Psychovisual => cdef_dist_wxh::<_, _, BD>( &input_region, &rec_region, visible_w, visible_h, - fi.sequence.bit_depth, |bias_area, bsize| { distortion_scale( fi, @@ -287,7 +291,7 @@ fn compute_distortion( }, fi.cpu_feature_level, ), - Tune::Psnr => sse_wxh( + Tune::Psnr => sse_wxh::<_, _, BD>( &input_region, &rec_region, visible_w, @@ -299,7 +303,6 @@ fn compute_distortion( bsize, ) }, - fi.sequence.bit_depth, fi.cpu_feature_level, ), } * fi.dist_scale[0]; @@ -323,7 +326,7 @@ fn compute_distortion( for p in 1..3 { let input_region = ts.input_tile.planes[p].subregion(area); let rec_region = ts.rec.planes[p].subregion(area); - distortion += sse_wxh( + distortion += sse_wxh::<_, _, BD>( &input_region, &rec_region, chroma_w, @@ -335,7 +338,6 @@ fn compute_distortion( bsize, ) }, - fi.sequence.bit_depth, fi.cpu_feature_level, ) * fi.dist_scale[p]; } @@ -344,7 +346,7 @@ fn compute_distortion( } // Compute the transform-domain distortion for an encode -fn compute_tx_distortion( +fn compute_tx_distortion( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion, skip: bool, luma_only: bool, @@ -372,7 +374,7 @@ fn compute_tx_distortion( } let mut distortion = if skip { - sse_wxh( + sse_wxh::<_, _, BD>( &input_region, &rec_region, visible_w, @@ -384,7 +386,6 @@ fn compute_tx_distortion( bsize, ) }, - fi.sequence.bit_depth, fi.cpu_feature_level, ) * fi.dist_scale[0] } else { @@ -411,7 +412,7 @@ fn compute_tx_distortion( for p in 1..3 { let input_region = ts.input_tile.planes[p].subregion(area); let rec_region = ts.rec.planes[p].subregion(area); - distortion += sse_wxh( + distortion += sse_wxh::<_, _, BD>( &input_region, &rec_region, chroma_w, @@ -423,7 +424,6 @@ fn compute_tx_distortion( bsize, ) }, - fi.sequence.bit_depth, fi.cpu_feature_level, ) * fi.dist_scale[p]; } @@ -720,7 +720,7 @@ pub fn compute_rd_cost( fi.lambda.mul_add(rate_in_bits, distortion.0 as f64) } -pub fn rdo_tx_size_type( +pub fn rdo_tx_size_type( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], @@ -759,7 +759,7 @@ pub fn rdo_tx_size_type( if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] }; // Luma plane transform type decision - let (tx_type, rd_cost) = rdo_tx_type_decision( + let (tx_type, rd_cost) = rdo_tx_type_decision::<_, BD>( fi, ts, cw, @@ -810,7 +810,7 @@ const fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool { } #[inline] -fn luma_chroma_mode_rdo( +fn luma_chroma_mode_rdo( luma_mode: PredictionMode, fi: &FrameInvariants, bsize: BlockSize, tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, rdo_type: RDOType, @@ -857,7 +857,7 @@ fn luma_chroma_mode_rdo( for sidx in select_segment(fi, ts, tile_bo, bsize, skip) { cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx); - let (tx_size, tx_type) = rdo_tx_size_type( + let (tx_size, tx_type) = rdo_tx_size_type::<_, BD>( fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip, ); for &chroma_mode in mode_set_chroma.iter() { @@ -878,7 +878,7 @@ fn luma_chroma_mode_rdo( luma_mode_is_intra && tx_size.block_size() != bsize; encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip); - let (has_coeff, tx_dist) = encode_block_post_cdef( + let (has_coeff, tx_dist) = encode_block_post_cdef::<_, _, BD>( fi, ts, cw, @@ -903,7 +903,7 @@ fn luma_chroma_mode_rdo( let rate = wr.tell_frac() - tell; let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel { - compute_tx_distortion( + compute_tx_distortion::<_, BD>( fi, ts, bsize, @@ -914,7 +914,14 @@ fn luma_chroma_mode_rdo( false, ) } else { - compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false) + compute_distortion::<_, BD>( + fi, + ts, + bsize, + is_chroma_block, + tile_bo, + false, + ) }; let is_zero_dist = distortion.0 == 0; let rd = compute_rd_cost(fi, rate, distortion); @@ -956,7 +963,7 @@ fn luma_chroma_mode_rdo( /// /// - If the best RD found is negative. /// This should never happen and indicates a development error. -pub fn rdo_mode_decision( +pub fn rdo_mode_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, @@ -975,7 +982,7 @@ pub fn rdo_mode_decision( let mut best = if fi.frame_type.has_inter() { assert!(fi.frame_type != FrameType::KEY); - inter_frame_rdo_mode_decision( + inter_frame_rdo_mode_decision::<_, BD>( fi, ts, cw, @@ -993,7 +1000,7 @@ pub fn rdo_mode_decision( has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); if !best.skip { - best = intra_frame_rdo_mode_decision( + best = intra_frame_rdo_mode_decision::<_, BD>( fi, ts, cw, @@ -1014,7 +1021,7 @@ pub fn rdo_mode_decision( let mut wr = WriterCounter::new(); let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 }; - write_tx_blocks( + write_tx_blocks::<_, _, BD>( fi, ts, cw, @@ -1034,7 +1041,9 @@ pub fn rdo_mode_decision( ); cw.rollback(&cw_checkpoint); if fi.sequence.chroma_sampling != ChromaSampling::Cs400 { - if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) { + if let Some(cfl) = + rdo_cfl_alpha::<_, BD>(ts, tile_bo, bsize, best.tx_size, fi) + { let mut wr = WriterCounter::new(); let tell = wr.tell_frac(); @@ -1047,7 +1056,7 @@ pub fn rdo_mode_decision( tile_bo, best.skip, ); - let (has_coeff, _) = encode_block_post_cdef( + let (has_coeff, _) = encode_block_post_cdef::<_, _, BD>( fi, ts, cw, @@ -1073,8 +1082,14 @@ pub fn rdo_mode_decision( let rate = wr.tell_frac() - tell; // For CFL, tx-domain distortion is not an option. - let distortion = - compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false); + let distortion = compute_distortion::<_, BD>( + fi, + ts, + bsize, + is_chroma_block, + tile_bo, + false, + ); let rd = compute_rd_cost(fi, rate, distortion); if rd < best.rd_cost { best.rd_cost = rd; @@ -1113,7 +1128,7 @@ pub fn rdo_mode_decision( } } -fn inter_frame_rdo_mode_decision( +fn inter_frame_rdo_mode_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint, @@ -1175,7 +1190,7 @@ fn inter_frame_rdo_mode_decision( pmv[1] = mv_stack[1].this_mv; } - let res = estimate_motion( + let res = estimate_motion::<_, BD>( fi, ts, bsize.width(), @@ -1320,7 +1335,7 @@ fn inter_frame_rdo_mode_decision( let mut rec_region = rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); - luma_mode.predict_inter( + luma_mode.predict_inter::<_, BD>( fi, tile_rect, 0, @@ -1337,12 +1352,11 @@ fn inter_frame_rdo_mode_decision( .subregion(Area::BlockStartingAt { bo: tile_bo.0 }); let plane_ref = rec_region.as_const(); - let satd = get_satd( + let satd = get_satd::<_, BD>( &plane_org, &plane_ref, bsize.width(), bsize.height(), - fi.sequence.bit_depth, fi.cpu_feature_level, ); satds.push(satd); @@ -1361,7 +1375,7 @@ fn inter_frame_rdo_mode_decision( |&((luma_mode, i), mvs, _satd)| { let mode_set_chroma = ArrayVec::from([luma_mode]); - luma_chroma_mode_rdo( + luma_chroma_mode_rdo::<_, BD>( luma_mode, fi, bsize, @@ -1385,7 +1399,7 @@ fn inter_frame_rdo_mode_decision( best } -fn intra_frame_rdo_mode_decision( +fn intra_frame_rdo_mode_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, @@ -1432,7 +1446,7 @@ fn intra_frame_rdo_mode_decision( let rec = &ts.rec.planes[0].as_const(); let po = tile_bo.plane_offset(rec.plane_cfg); // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block - get_intra_edges( + get_intra_edges::<_, BD>( rec, tile_bo, 0, @@ -1440,7 +1454,6 @@ fn intra_frame_rdo_mode_decision( bsize, po, tx_size, - fi.sequence.bit_depth, None, fi.sequence.enable_intra_edge_filter, IntraParam::None, @@ -1466,11 +1479,10 @@ fn intra_frame_rdo_mode_decision( let mut rec_region = rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block - luma_mode.predict_intra( + luma_mode.predict_intra::<_, BD>( tile_rect, &mut rec_region, tx_size, - fi.sequence.bit_depth, &[0i16; 2], IntraParam::None, if luma_mode.is_directional() { ief_params } else { None }, @@ -1482,12 +1494,11 @@ fn intra_frame_rdo_mode_decision( .subregion(Area::BlockStartingAt { bo: tile_bo.0 }); let plane_ref = rec_region.as_const(); - satds_all[luma_mode as usize] = get_satd( + satds_all[luma_mode as usize] = get_satd::<_, BD>( &plane_org, &plane_ref, tx_size.width(), tx_size.height(), - fi.sequence.bit_depth, fi.cpu_feature_level, ); } @@ -1507,7 +1518,7 @@ fn intra_frame_rdo_mode_decision( if is_chroma_block && luma_mode != PredictionMode::DC_PRED { mode_set_chroma.push(PredictionMode::DC_PRED); } - luma_chroma_mode_rdo( + luma_chroma_mode_rdo::<_, BD>( luma_mode, fi, bsize, @@ -1541,7 +1552,7 @@ fn intra_frame_rdo_mode_decision( let mut best_angle_delta = best.angle_delta; let mut angle_delta_rdo = |y, uv| -> AngleDelta { if best.angle_delta.y != y || best.angle_delta.uv != uv { - luma_chroma_mode_rdo( + luma_chroma_mode_rdo::<_, BD>( best.pred_mode_luma, fi, bsize, @@ -1581,7 +1592,7 @@ fn intra_frame_rdo_mode_decision( /// # Panics /// /// - If the block size is invalid for subsampling. -pub fn rdo_cfl_alpha( +pub fn rdo_cfl_alpha( ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, luma_tx_size: TxSize, fi: &FrameInvariants, ) -> Option { @@ -1613,7 +1624,7 @@ pub fn rdo_cfl_alpha( let rec = &mut ts.rec.planes[p]; let input = &ts.input_tile.planes[p]; let po = tile_bo.plane_offset(rec.plane_cfg); - let edge_buf = get_intra_edges( + let edge_buf = get_intra_edges::<_, BD>( &rec.as_const(), tile_bo, 0, @@ -1621,7 +1632,6 @@ pub fn rdo_cfl_alpha( bsize, po, uv_tx_size, - fi.sequence.bit_depth, Some(PredictionMode::UV_CFL_PRED), fi.sequence.enable_intra_edge_filter, IntraParam::None, @@ -1629,24 +1639,22 @@ pub fn rdo_cfl_alpha( let mut alpha_cost = |alpha: i16| -> u64 { let mut rec_region = rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); - PredictionMode::UV_CFL_PRED.predict_intra( + PredictionMode::UV_CFL_PRED.predict_intra::<_, BD>( tile_rect, &mut rec_region, uv_tx_size, - fi.sequence.bit_depth, &ac.data, IntraParam::Alpha(alpha), None, &edge_buf, fi.cpu_feature_level, ); - sse_wxh( + sse_wxh::<_, _, BD>( &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }), &rec_region.as_const(), visible_tx_w, visible_tx_h, |_, _| DistortionScale::default(), // We're not doing RDO here. - fi.sequence.bit_depth, fi.cpu_feature_level, ) .0 @@ -1688,7 +1696,7 @@ pub fn rdo_cfl_alpha( /// This should never happen and indicates a development error. /// - If the best RD found is negative. /// This should never happen and indicates a development error. -pub fn rdo_tx_type_decision( +pub fn rdo_tx_type_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, cw_checkpoint: &mut Option, mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], @@ -1726,7 +1734,7 @@ pub fn rdo_tx_type_decision( } if is_inter { - motion_compensate( + motion_compensate::<_, BD>( fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true, ); } @@ -1734,7 +1742,7 @@ pub fn rdo_tx_type_decision( let mut wr = WriterCounter::new(); let tell = wr.tell_frac(); let (_, tx_dist) = if is_inter { - write_tx_tree( + write_tx_tree::<_, _, BD>( fi, ts, cw, @@ -1751,7 +1759,7 @@ pub fn rdo_tx_type_decision( need_recon_pixel, ) } else { - write_tx_blocks( + write_tx_blocks::<_, _, BD>( fi, ts, cw, @@ -1773,7 +1781,7 @@ pub fn rdo_tx_type_decision( let rate = wr.tell_frac() - tell; let distortion = if fi.use_tx_domain_distortion { - compute_tx_distortion( + compute_tx_distortion::<_, BD>( fi, ts, bsize, @@ -1784,7 +1792,14 @@ pub fn rdo_tx_type_decision( true, ) } else { - compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true) + compute_distortion::<_, BD>( + fi, + ts, + bsize, + is_chroma_block, + tile_bo, + true, + ) }; cw.rollback(cw_checkpoint.as_ref().unwrap()); @@ -1836,14 +1851,14 @@ pub fn get_sub_partitions( } #[inline(always)] -fn rdo_partition_none( +fn rdo_partition_none( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, child_modes: &mut ArrayVec, ) -> f64 { debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height); - let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg); + let mode = rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg); let cost = mode.rd_cost; child_modes.push(mode); @@ -1853,7 +1868,7 @@ fn rdo_partition_none( // VERTICAL, HORIZONTAL or simple SPLIT #[inline(always)] -fn rdo_partition_simple( +fn rdo_partition_simple( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, @@ -1895,7 +1910,7 @@ fn rdo_partition_simple( if has_cols && has_rows { let mode_decision = - rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg); + rdo_mode_decision::<_, BD>(fi, ts, cw, subsize, offset, inter_cfg); rd_cost_sum += mode_decision.rd_cost; @@ -1907,7 +1922,7 @@ fn rdo_partition_simple( if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize); } - encode_block_with_modes( + encode_block_with_modes::<_, _, BD>( fi, ts, cw, @@ -1935,7 +1950,7 @@ fn rdo_partition_simple( /// /// - If the best RD found is negative. /// This should never happen, and indicates a development error. -pub fn rdo_partition_decision( +pub fn rdo_partition_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, @@ -1960,7 +1975,7 @@ pub fn rdo_partition_decision( let cost = match partition { PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => { - Some(rdo_partition_none( + Some(rdo_partition_none::<_, BD>( fi, ts, cw, @@ -1971,7 +1986,7 @@ pub fn rdo_partition_decision( )) } PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => { - rdo_partition_simple( + rdo_partition_simple::<_, _, BD>( fi, ts, cw, @@ -2012,7 +2027,7 @@ pub fn rdo_partition_decision( } } -fn rdo_loop_plane_error( +fn rdo_loop_plane_error( base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, sb_w: usize, sb_h: usize, fi: &FrameInvariants, ts: &TileStateMut<'_, T>, blocks: &TileBlocks<'_>, test: &Frame, src: &Tile<'_, T>, pli: usize, @@ -2054,23 +2069,21 @@ fn rdo_loop_plane_error( // For loop filters, We intentionally use cdef_dist even with // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a // significant negative impact on other metrics and visual quality. - RawDistortion(cdef_dist_kernel( + RawDistortion(cdef_dist_kernel::<_, BD>( &src_region, &test_region, 8, 8, - fi.sequence.bit_depth, fi.cpu_feature_level, ) as u64) * bias } else { - sse_wxh( + sse_wxh::<_, _, BD>( &src_region, &test_region, 8 >> xdec, 8 >> ydec, |_, _| bias, - fi.sequence.bit_depth, fi.cpu_feature_level, ) }; @@ -2088,7 +2101,7 @@ fn rdo_loop_plane_error( /// # Panics /// /// - If both CDEF and LRF are disabled. -pub fn rdo_loop_decision( +pub fn rdo_loop_decision( base_sbo: TileSuperBlockOffset, fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, deblock_p: bool, @@ -2285,7 +2298,7 @@ pub fn rdo_loop_decision( // Find a good deblocking filter solution for the passed in area. // This is not RDO of deblocking itself, merely a solution to get // better results from CDEF/LRF RDO. - let deblock_levels = deblock_filter_optimize( + let deblock_levels = deblock_filter_optimize::<_, _, BD>( fi, &rec_subset.as_tile(), &src_subset, @@ -2301,13 +2314,12 @@ pub fn rdo_loop_decision( deblock_copy.levels = deblock_levels; // finally, deblock the temp frame - deblock_filter_frame( + deblock_filter_frame::<_, BD>( &deblock_copy, &mut rec_subset.as_tile_mut(), &tileblocks_subset.as_const(), crop_w, crop_h, - fi.sequence.bit_depth, planes, ); } @@ -2335,7 +2347,7 @@ pub fn rdo_loop_decision( if cdef_work.is_some() { Some(( &rec_subset, - cdef_analyze_superblock_range( + cdef_analyze_superblock_range::<_, BD>( fi, &rec_subset, &tileblocks_subset.as_const(), @@ -2382,7 +2394,7 @@ pub fn rdo_loop_decision( let mut err = ScaledDistortion::zero(); let mut rate = 0; - cdef_filter_superblock( + cdef_filter_superblock::<_, BD>( fi, &rec_subset, &mut cdef_ref.as_tile_mut(), @@ -2418,7 +2430,7 @@ pub fn rdo_loop_decision( // We have a valid LRU, apply LRF, compute error match best_lrf[lru_y * lru_w[pli] + lru_x][pli] { RestorationFilter::None {} => { - err += rdo_loop_plane_error( + err += rdo_loop_plane_error::<_, BD>( base_sbo, loop_sbo, 1, @@ -2459,7 +2471,7 @@ pub fn rdo_loop_decision( &cdef_ref.planes[pli].slice(loop_po), &cdef_ref.planes[pli].slice(loop_po), ); - sgrproj_stripe_filter( + sgrproj_stripe_filter::<_, _, BD>( set, xqd, fi, @@ -2473,7 +2485,7 @@ pub fn rdo_loop_decision( height: vis_height, }), ); - err += rdo_loop_plane_error( + err += rdo_loop_plane_error::<_, BD>( base_sbo, loop_sbo, 1, @@ -2496,7 +2508,7 @@ pub fn rdo_loop_decision( } } else { // No actual LRU here, compute error directly from CDEF output. - err += rdo_loop_plane_error( + err += rdo_loop_plane_error::<_, BD>( base_sbo, loop_sbo, 1, @@ -2540,7 +2552,7 @@ pub fn rdo_loop_decision( // Keep cdef output up to date; we need it for restoration // both below and above (padding) - cdef_filter_superblock( + cdef_filter_superblock::<_, BD>( fi, rec_copy, &mut cdef_ref_tm, @@ -2605,7 +2617,7 @@ pub fn rdo_loop_decision( // Check the no filter option { - let err = rdo_loop_plane_error( + let err = rdo_loop_plane_error::<_, BD>( base_sbo, loop_sbo, lru_sb_w, @@ -2660,7 +2672,7 @@ pub fn rdo_loop_decision( for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity) { - let (xqd0, xqd1) = sgrproj_solve( + let (xqd0, xqd1) = sgrproj_solve::<_, BD>( set, fi, &ts.integral_buffer, @@ -2673,7 +2685,7 @@ pub fn rdo_loop_decision( let current_lrf = RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] }; if let RestorationFilter::Sgrproj { set, xqd } = current_lrf { - sgrproj_stripe_filter( + sgrproj_stripe_filter::<_, _, BD>( set, xqd, fi, @@ -2688,7 +2700,7 @@ pub fn rdo_loop_decision( }), ); } - let err = rdo_loop_plane_error( + let err = rdo_loop_plane_error::<_, BD>( base_sbo, loop_sbo, lru_sb_w, diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 7414f09d3b..037ac8aadd 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -86,8 +86,6 @@ pub struct SceneChangeDetector { score_deque: Vec, /// Number of pixels in scaled frame for fast mode pixels: usize, - /// The bit depth of the video. - bit_depth: usize, /// The CPU feature level to be used. cpu_feature_level: CpuFeatureLevel, encoder_config: EncoderConfig, @@ -147,7 +145,6 @@ impl SceneChangeDetector { deque_offset, score_deque, pixels, - bit_depth, cpu_feature_level, encoder_config, sequence, @@ -165,7 +162,7 @@ impl SceneChangeDetector { /// /// This will gracefully handle the first frame in the video as well. #[hawktracer(analyze_next_frame)] - pub fn analyze_next_frame( + pub fn analyze_next_frame( &mut self, frame_set: &[&Arc>], input_frameno: u64, previous_keyframe: u64, ) -> bool { @@ -196,9 +193,13 @@ impl SceneChangeDetector { && frame_set.len() > self.deque_offset + 1 && self.score_deque.is_empty() { - self.initialize_score_deque(frame_set, input_frameno, self.deque_offset); + self.initialize_score_deque::( + frame_set, + input_frameno, + self.deque_offset, + ); } else if self.score_deque.is_empty() { - self.initialize_score_deque( + self.initialize_score_deque::( frame_set, input_frameno, frame_set.len() - 1, @@ -209,7 +210,7 @@ impl SceneChangeDetector { // Running single frame comparison and adding it to deque // Decrease deque offset if there is no new frames if frame_set.len() > self.deque_offset + 1 { - self.run_comparison( + self.run_comparison::( frame_set[self.deque_offset].clone(), frame_set[self.deque_offset + 1].clone(), input_frameno + self.deque_offset as u64, @@ -219,7 +220,7 @@ impl SceneChangeDetector { } // Adaptive scenecut check - let (scenecut, score) = self.adaptive_scenecut(); + let (scenecut, score) = self.adaptive_scenecut::(); let scenecut = self.handle_min_max_intervals(distance).unwrap_or(scenecut); debug!( "[SC-Detect] Frame {}: Raw={:5.1} ImpBl={:5.1} Bwd={:5.1} Fwd={:5.1} Th={:.1} {}", @@ -253,12 +254,12 @@ impl SceneChangeDetector { } // Initially fill score deque with frame scores - fn initialize_score_deque( + fn initialize_score_deque( &mut self, frame_set: &[&Arc>], input_frameno: u64, init_len: usize, ) { for x in 0..init_len { - self.run_comparison( + self.run_comparison::( frame_set[x].clone(), frame_set[x + 1].clone(), input_frameno + x as u64, @@ -268,14 +269,14 @@ impl SceneChangeDetector { /// Runs scene change comparison beetween 2 given frames /// Insert result to start of score deque - fn run_comparison( + fn run_comparison( &mut self, frame1: Arc>, frame2: Arc>, input_frameno: u64, ) { let mut result = if self.speed_mode == SceneDetectionSpeed::Fast { self.fast_scenecut(frame1, frame2) } else { - self.cost_scenecut(frame1, frame2, input_frameno) + self.cost_scenecut::(frame1, frame2, input_frameno) }; // Subtract the highest metric value of surrounding frames from the current one @@ -322,7 +323,7 @@ impl SceneChangeDetector { /// Compares current scene score to adapted threshold based on previous scores /// Value of current frame is offset by lookahead, if lookahead >=5 /// Returns true if current scene score is higher than adapted threshold - fn adaptive_scenecut(&mut self) -> (bool, ScenecutResult) { + fn adaptive_scenecut(&mut self) -> (bool, ScenecutResult) { let score = self.score_deque[self.deque_offset]; // We use the importance block algorithm's cost metrics as a secondary algorithm @@ -333,8 +334,7 @@ impl SceneChangeDetector { // the importance block algorithm is over the threshold either on this frame (hard scenecut) // or within the past few frames (pan). This helps filter out a few false positives // produced by the cost-based algorithm. - let imp_block_threshold = - IMP_BLOCK_DIFF_THRESHOLD * (self.bit_depth as f64) / 8.0; + let imp_block_threshold = IMP_BLOCK_DIFF_THRESHOLD * (BD as f64) / 8.0; if !&self.score_deque[self.deque_offset..] .iter() .any(|result| result.imp_block_cost >= imp_block_threshold) diff --git a/src/scenechange/standard.rs b/src/scenechange/standard.rs index 1f058271df..2f452164e9 100644 --- a/src/scenechange/standard.rs +++ b/src/scenechange/standard.rs @@ -18,7 +18,7 @@ impl SceneChangeDetector { /// We gather both intra and inter costs for the frames, /// as well as an importance-block-based difference, /// and use all three metrics. - pub(super) fn cost_scenecut( + pub(super) fn cost_scenecut( &mut self, frame1: Arc>, frame2: Arc>, input_frameno: u64, ) -> ScenecutResult { @@ -49,10 +49,9 @@ impl SceneChangeDetector { let intra_costs = self.intra_costs.entry(input_frameno).or_insert_with(|| { - estimate_intra_costs( + estimate_intra_costs::<_, BD>( temp_plane, &*frame2, - self.bit_depth, self.cpu_feature_level, ) }); @@ -67,10 +66,9 @@ impl SceneChangeDetector { }; }); s.spawn(|_| { - mv_inter_cost = estimate_inter_costs( + mv_inter_cost = estimate_inter_costs::<_, BD>( frame2_inter_ref, frame1, - self.bit_depth, self.encoder_config.clone(), self.sequence.clone(), buffer, diff --git a/src/segmentation.rs b/src/segmentation.rs index 36ee42fb1c..776d90c265 100644 --- a/src/segmentation.rs +++ b/src/segmentation.rs @@ -19,7 +19,7 @@ use crate::FrameState; pub const MAX_SEGMENTS: usize = 8; -pub fn segmentation_optimize( +pub fn segmentation_optimize( fi: &FrameInvariants, fs: &mut FrameState, ) { assert!(fi.enable_segmentation); @@ -50,11 +50,11 @@ pub fn segmentation_optimize( } assert_ne!(min_segment, MAX_SEGMENTS); fs.segmentation.min_segment = min_segment as u8; - fs.segmentation.update_threshold(fi.base_q_idx, fi.config.bit_depth); + fs.segmentation.update_threshold::(fi.base_q_idx); return; } - segmentation_optimize_inner(fi, fs, offset_lower_limit); + segmentation_optimize_inner::<_, BD>(fi, fs, offset_lower_limit); /* Figure out parameters */ fs.segmentation.preskip = false; @@ -73,7 +73,7 @@ pub fn segmentation_optimize( } // Select target quantizers for each segment by fitting to log(scale). -fn segmentation_optimize_inner( +fn segmentation_optimize_inner( fi: &FrameInvariants, fs: &mut FrameState, offset_lower_limit: i16, ) { use crate::quantize::{ac_q, select_ac_qi}; @@ -112,8 +112,7 @@ fn segmentation_optimize_inner( // See `distortion_scale_for` for more information. let compute_delta = |centroids: &[i16]| { use crate::util::{bexp64, blog64}; - let log2_base_ac_q_q57 = - blog64(ac_q(fi.base_q_idx, 0, fi.config.bit_depth).get().into()); + let log2_base_ac_q_q57 = blog64(ac_q::(fi.base_q_idx, 0).get().into()); centroids .iter() .rev() @@ -128,8 +127,7 @@ fn segmentation_optimize_inner( // and take the delta from the base quantizer index. .map(|q| { // Avoid going into lossless mode by never bringing qidx below 1. - select_ac_qi(q, fi.config.bit_depth).max(1) as i16 - - fi.base_q_idx as i16 + select_ac_qi::(q).max(1) as i16 - fi.base_q_idx as i16 }) .collect::>() }; @@ -155,7 +153,7 @@ fn segmentation_optimize_inner( data[SegLvl::SEG_LVL_ALT_Q as usize] = delta.max(offset_lower_limit); } - fs.segmentation.update_threshold(fi.base_q_idx, fi.config.bit_depth); + fs.segmentation.update_threshold::(fi.base_q_idx); } pub fn select_segment( diff --git a/src/transform/forward.rs b/src/transform/forward.rs index ac9f4e850b..d50ad8435a 100644 --- a/src/transform/forward.rs +++ b/src/transform/forward.rs @@ -98,9 +98,9 @@ pub mod rust { /// /// - If called with an invalid combination of `tx_size` and `tx_type` #[cold_for_target_arch("x86_64")] - pub fn forward_transform( + pub fn forward_transform( input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize, - tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, + tx_type: TxType, _cpu: CpuFeatureLevel, ) { assert!(valid_av1_transform(tx_size, tx_type)); @@ -117,7 +117,7 @@ pub mod rust { let mut tmp: Aligned<[i32; 64 * 64]> = unsafe { Aligned::uninitialized() }; let buf = &mut tmp.data[..txfm_size_col * txfm_size_row]; - let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd); + let cfg = Txfm2DFlipCfg::fwd::(tx_type, tx_size); let txfm_func_col = get_func(cfg.txfm_type_col); let txfm_func_row = get_func(cfg.txfm_type_row); diff --git a/src/transform/forward_shared.rs b/src/transform/forward_shared.rs index 2c818fb89e..221b99f4d3 100644 --- a/src/transform/forward_shared.rs +++ b/src/transform/forward_shared.rs @@ -119,7 +119,7 @@ impl Txfm2DFlipCfg { /// # Panics /// /// - If called with an invalid combination of `tx_size` and `tx_type` - pub fn fwd(tx_type: TxType, tx_size: TxSize, bd: usize) -> Self { + pub fn fwd(tx_type: TxType, tx_size: TxSize) -> Self { let tx_type_1d_col = VTX_TAB[tx_type as usize]; let tx_type_1d_row = HTX_TAB[tx_type as usize]; let txw_idx = tx_size.width_index(); @@ -134,7 +134,7 @@ impl Txfm2DFlipCfg { tx_size, ud_flip, lr_flip, - shift: FWD_TXFM_SHIFT_LS[tx_size as usize][(bd - 8) / 2], + shift: FWD_TXFM_SHIFT_LS[tx_size as usize][(BD - 8) / 2], txfm_type_col, txfm_type_row, } diff --git a/src/transform/inverse.rs b/src/transform/inverse.rs index cfe136352b..54c0a2d575 100644 --- a/src/transform/inverse.rs +++ b/src/transform/inverse.rs @@ -1602,9 +1602,9 @@ pub(crate) mod rust { use std::cmp; #[cold_for_target_arch("x86_64", "aarch64")] - pub fn inverse_transform_add( + pub fn inverse_transform_add( input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize, - tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, + tx_size: TxSize, tx_type: TxType, _cpu: CpuFeatureLevel, ) { let width: usize = tx_size.width(); let height: usize = tx_size.height(); @@ -1619,7 +1619,7 @@ pub(crate) mod rust { let tx_types_1d = get_1d_tx_types(tx_type); // perform inv txfm on every row - let range = bd + 8; + let range = BD + 8; let txfm_fn = INV_TXFM_FNS[tx_types_1d.1 as usize][ILog::ilog(width) - 3]; // 64 point transforms only signal 32 coeffs. We only take chunks of 32 // and skip over the last 32 transforms here. @@ -1645,7 +1645,7 @@ pub(crate) mod rust { } // perform inv txfm on every col - let range = cmp::max(bd + 6, 16); + let range = cmp::max(BD + 6, 16); let txfm_fn = INV_TXFM_FNS[tx_types_1d.0 as usize][ILog::ilog(height) - 3]; for c in 0..width { let mut temp_in: [i32; 64] = [0; 64]; @@ -1664,7 +1664,7 @@ pub(crate) mod rust { .zip(output.rows_iter_mut().map(|row| &mut row[c]).take(height)) { let v: i32 = (*out).as_(); - let v = clamp(v + round_shift(*temp, 4), 0, (1 << bd) - 1); + let v = clamp(v + round_shift(*temp, 4), 0, (1 << BD) - 1); *out = T::cast_from(v); } } diff --git a/src/transform/mod.rs b/src/transform/mod.rs index d14913e133..d05dc6dfe5 100644 --- a/src/transform/mod.rs +++ b/src/transform/mod.rs @@ -450,7 +450,7 @@ mod test { use crate::frame::*; use rand::random; - fn test_roundtrip( + fn test_roundtrip( tx_size: TxSize, tx_type: TxType, tolerance: i16, ) { let cpu = CpuFeatureLevel::default(); @@ -474,14 +474,20 @@ mod test { *d = T::cast_from(random::()); *r = i16::cast_from(*s) - i16::cast_from(*d); } - forward_transform(res, freq, tx_size.width(), tx_size, tx_type, 8, cpu); - inverse_transform_add( + forward_transform::<_, BD>( + res, + freq, + tx_size.width(), + tx_size, + tx_type, + cpu, + ); + inverse_transform_add::<_, BD>( freq, &mut dst.as_region_mut(), coeff_area, tx_size, tx_type, - 8, cpu, ); @@ -526,7 +532,7 @@ mod test { } } - fn roundtrips() { + fn roundtrips() { let combinations = [ (TX_4X4, DCT_DCT, 0), (TX_4X4, ADST_DCT, 0), @@ -577,17 +583,17 @@ mod test { ]; for &(tx_size, tx_type, tolerance) in combinations.iter() { println!("Testing combination {:?}, {:?}", tx_size, tx_type); - test_roundtrip::(tx_size, tx_type, tolerance); + test_roundtrip::(tx_size, tx_type, tolerance); } } #[test] fn roundtrips_u8() { - roundtrips::(); + roundtrips::(); } #[test] fn roundtrips_u16() { - roundtrips::(); + roundtrips::(); } }