From cf61536c0bb11bd4001278bd0cdad841a646d1eb Mon Sep 17 00:00:00 2001
From: Josh Holmer <jholmer.in@gmail.com>
Date: Mon, 30 Jan 2023 09:15:02 -0500
Subject: [PATCH 1/2] Use generics for bit depth throughout the encoder

This is a followup from #3116 which expands this optimization
to as many places in the encoder as we can reasonably utilize it.
By using generics, there are places where the compiler is able to
simplify math operations at compile time as well as areas where the
compiler is able to remove branches so that we only branch on bit depth
at the highest level of the code (and therefore the fewest number of
times).

Based on hyperfine benchmarking, this results in a 1-2% speedup across
the encoding process, although it does increase the final binary size.
---
 src/activity.rs                     |  25 +-
 src/api/config/mod.rs               |  11 +-
 src/api/context.rs                  |  14 +-
 src/api/internal.rs                 |  95 ++++---
 src/api/lookahead.rs                |  28 +-
 src/asm/aarch64/cdef.rs             |   5 +-
 src/asm/shared/transform/inverse.rs |  16 +-
 src/asm/x86/cdef.rs                 |  16 +-
 src/asm/x86/dist/cdef_dist.rs       |  32 ++-
 src/asm/x86/dist/mod.rs             |  20 +-
 src/asm/x86/dist/sse.rs             |   7 +-
 src/asm/x86/mc.rs                   |  50 ++--
 src/asm/x86/predict.rs              |  15 +-
 src/asm/x86/quantize.rs             |  31 +--
 src/asm/x86/transform/forward.rs    |  22 +-
 src/asm/x86/transform/inverse.rs    |  18 +-
 src/cdef.rs                         |  37 +--
 src/deblock.rs                      | 401 +++++++++++++---------------
 src/dist.rs                         |  37 +--
 src/encoder.rs                      | 181 ++++++-------
 src/lrf.rs                          |  85 ++----
 src/mc.rs                           |  26 +-
 src/me.rs                           | 257 +++++++-----------
 src/partition.rs                    |   5 +-
 src/predict.rs                      | 143 +++++-----
 src/quantize/mod.rs                 |  52 ++--
 src/rate.rs                         |  81 +++---
 src/rdo.rs                          | 196 +++++++-------
 src/scenechange/mod.rs              |  30 +--
 src/scenechange/standard.rs         |   8 +-
 src/segmentation.rs                 |  16 +-
 src/transform/forward.rs            |   6 +-
 src/transform/forward_shared.rs     |   4 +-
 src/transform/inverse.rs            |  10 +-
 src/transform/mod.rs                |  10 +-
 35 files changed, 922 insertions(+), 1068 deletions(-)
diff --git a/src/activity.rs b/src/activity.rs
index 5a8400d978..79a634540f 100644
--- a/src/activity.rs
+++ b/src/activity.rs
@@ -56,11 +56,11 @@ impl ActivityMask {
   }
 
   #[hawktracer(activity_mask_fill_scales)]
-  pub fn fill_scales(
-    &self, bit_depth: usize, activity_scales: &mut Box<[DistortionScale]>,
+  pub fn fill_scales<const BD: usize>(
+    &self, activity_scales: &mut Box<[DistortionScale]>,
   ) {
     for (dst, &src) in activity_scales.iter_mut().zip(self.variances.iter()) {
-      *dst = ssim_boost(src, src, bit_depth);
+      *dst = ssim_boost::<BD>(src, src);
     }
   }
 }
@@ -146,21 +146,20 @@ fn ssim_boost_rsqrt(x: u64) -> RsqrtOutput {
 }
 
 #[inline(always)]
-pub fn ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> DistortionScale {
-  DistortionScale(apply_ssim_boost(
+pub fn ssim_boost<const BD: usize>(svar: u32, dvar: u32) -> DistortionScale {
+  DistortionScale(apply_ssim_boost::<BD>(
     DistortionScale::default().0,
     svar,
     dvar,
-    bit_depth,
   ))
 }
 
 /// Apply ssim boost to a given input
 #[inline(always)]
-pub fn apply_ssim_boost(
-  input: u32, svar: u32, dvar: u32, bit_depth: usize,
+pub fn apply_ssim_boost<const BD: usize>(
+  input: u32, svar: u32, dvar: u32,
 ) -> u32 {
-  let coeff_shift = bit_depth - 8;
+  let coeff_shift = BD - 8;
 
   // Scale dvar and svar to lbd range to prevent overflows.
   let svar = (svar >> (2 * coeff_shift)) as u64;
@@ -199,7 +198,7 @@ mod ssim_boost_tests {
     let max_pix_diff = (1 << 12) - 1;
     let max_pix_sse = max_pix_diff * max_pix_diff;
     let max_variance = max_pix_diff * 8 * 8 / 4;
-    apply_ssim_boost(max_pix_sse * 8 * 8, max_variance, max_variance, 12);
+    apply_ssim_boost::<12>(max_pix_sse * 8 * 8, max_variance, max_variance);
   }
 
   /// Floating point reference version of `ssim_boost`
@@ -234,8 +233,8 @@ mod ssim_boost_tests {
         let dvar = rng.gen_range(0..(1 << scale));
 
         let float = reference_ssim_boost(svar, dvar, 12);
-        let fixed =
-          apply_ssim_boost(1 << 23, svar, dvar, 12) as f64 / (1 << 23) as f64;
+        let fixed = apply_ssim_boost::<12>(1 << 23, svar, dvar) as f64
+          / (1 << 23) as f64;
 
         // Compare the two versions
         max_relative_error =
@@ -260,7 +259,7 @@ mod ssim_boost_tests {
     for svar in scale..(scale << 2) {
       let float = ((scale << 1) as f64 / svar as f64).cbrt();
       let fixed =
-        apply_ssim_boost(1 << 23, svar, svar, bd) as f64 / (1 << 23) as f64;
+        apply_ssim_boost(1 << 23, svar, svar) as f64 / (1 << 23) as f64;
 
       // Compare the two versions
       max_relative_error =
diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs
index fbb5ad3e5b..42f3b211a3 100644
--- a/src/api/config/mod.rs
+++ b/src/api/config/mod.rs
@@ -248,8 +248,15 @@ impl Config {
     // First-pass parameters depend on whether second-pass is in effect.
     // So `init_first_pass` must follow `init_second_pass`.
     if self.rate_control.emit_pass_data {
-      let maybe_pass1_log_base_q = (self.rate_control.summary.is_none())
-        .then(|| inner.rc_state.select_pass1_log_base_q(&inner, 0));
+      let maybe_pass1_log_base_q =
+        (self.rate_control.summary.is_none()).then(|| {
+          match self.enc.bit_depth {
+            8 => inner.rc_state.select_pass1_log_base_q::<_, 8>(&inner, 0),
+            10 => inner.rc_state.select_pass1_log_base_q::<_, 10>(&inner, 0),
+            12 => inner.rc_state.select_pass1_log_base_q::<_, 12>(&inner, 0),
+            _ => unimplemented!(),
+          }
+        });
       inner.rc_state.init_first_pass(maybe_pass1_log_base_q);
     }
 
diff --git a/src/api/context.rs b/src/api/context.rs
index 58d697fd61..9366c53fbb 100644
--- a/src/api/context.rs
+++ b/src/api/context.rs
@@ -129,7 +129,12 @@ impl<T: Pixel> Context<T> {
     }
 
     let inner = &mut self.inner;
-    let run = move || inner.send_frame(frame, params);
+    let run = move || match inner.config.bit_depth {
+      8 => inner.send_frame::<8>(frame, params),
+      10 => inner.send_frame::<10>(frame, params),
+      12 => inner.send_frame::<12>(frame, params),
+      _ => unimplemented!(),
+    };
 
     match &self.pool {
       Some(pool) => pool.install(run),
@@ -302,7 +307,12 @@ impl<T: Pixel> Context<T> {
   #[inline]
   pub fn receive_packet(&mut self) -> Result<Packet<T>, EncoderStatus> {
     let inner = &mut self.inner;
-    let mut run = move || inner.receive_packet();
+    let mut run = move || match inner.config.bit_depth {
+      8 => inner.receive_packet::<8>(),
+      10 => inner.receive_packet::<10>(),
+      12 => inner.receive_packet::<12>(),
+      _ => unimplemented!(),
+    };
 
     match &self.pool {
       Some(pool) => pool.install(run),
diff --git a/src/api/internal.rs b/src/api/internal.rs
index 1a978de836..5c379d5a86 100644
--- a/src/api/internal.rs
+++ b/src/api/internal.rs
@@ -317,7 +317,7 @@ impl<T: Pixel> ContextInner<T> {
   }
 
   #[hawktracer(send_frame)]
-  pub fn send_frame(
+  pub fn send_frame<const BD: usize>(
     &mut self, mut frame: Option<Arc<Frame<T>>>,
     params: Option<FrameParameters>,
   ) -> Result<(), EncoderStatus> {
@@ -376,7 +376,7 @@ impl<T: Pixel> ContextInner<T> {
             break;
           }
 
-          Self::compute_keyframe_placement(
+          Self::compute_keyframe_placement::<BD>(
             cur_lookahead_frames,
             &self.keyframes_forced,
             &mut self.keyframe_detector,
@@ -385,7 +385,7 @@ impl<T: Pixel> ContextInner<T> {
           );
         }
       } else {
-        Self::compute_keyframe_placement(
+        Self::compute_keyframe_placement::<BD>(
           &lookahead_frames,
           &self.keyframes_forced,
           &mut self.keyframe_detector,
@@ -395,7 +395,7 @@ impl<T: Pixel> ContextInner<T> {
       }
     }
 
-    self.compute_frame_invariants();
+    self.compute_frame_invariants::<BD>();
 
     Ok(())
   }
@@ -649,7 +649,9 @@ impl<T: Pixel> ContextInner<T> {
   /// function must be called after every new `FrameInvariants` is initially
   /// computed.
   #[hawktracer(compute_lookahead_motion_vectors)]
-  fn compute_lookahead_motion_vectors(&mut self, output_frameno: u64) {
+  fn compute_lookahead_motion_vectors<const BD: usize>(
+    &mut self, output_frameno: u64,
+  ) {
     let frame_data = self.frame_data.get(&output_frameno).unwrap();
 
     // We're only interested in valid frames which are not show-existing-frame.
@@ -665,7 +667,7 @@ impl<T: Pixel> ContextInner<T> {
 
     let qps = {
       let fti = frame_data.as_ref().unwrap().fi.get_frame_subtype();
-      self.rc_state.select_qi(
+      self.rc_state.select_qi::<_, BD>(
         self,
         output_frameno,
         fti,
@@ -742,14 +744,14 @@ impl<T: Pixel> ContextInner<T> {
     fi.rec_buffer = coded_data.lookahead_rec_buffer.clone();
 
     // Estimate lambda with rate-control dry-run
-    fi.set_quantizers(&qps);
+    fi.set_quantizers::<BD>(&qps);
 
     // TODO: as in the encoding code, key frames will have no references.
     // However, for block importance purposes we want key frames to act as
     // P-frames in this instance.
     //
     // Compute the motion vectors.
-    compute_motion_vectors(fi, fs, &self.inter_cfg);
+    compute_motion_vectors::<_, BD>(fi, fs, &self.inter_cfg);
 
     let coded_data = fi.coded_frame_data.as_mut().unwrap();
 
@@ -818,7 +820,9 @@ impl<T: Pixel> ContextInner<T> {
   /// Computes lookahead intra cost approximations and fills in
   /// `lookahead_intra_costs` on the `FrameInvariants`.
   #[hawktracer(compute_lookahead_intra_costs)]
-  fn compute_lookahead_intra_costs(&mut self, output_frameno: u64) {
+  fn compute_lookahead_intra_costs<const BD: usize>(
+    &mut self, output_frameno: u64,
+  ) {
     let frame_data = self.frame_data.get(&output_frameno).unwrap();
     let fd = &frame_data.as_ref();
 
@@ -853,23 +857,22 @@ impl<T: Pixel> ContextInner<T> {
 
         // We use the cached values from scenechange if available,
         // otherwise we need to calculate them here.
-        estimate_intra_costs(
+        estimate_intra_costs::<_, BD>(
           temp_plane,
           &**frame,
-          fi.sequence.bit_depth,
           fi.cpu_feature_level,
         )
       });
   }
 
   #[hawktracer(compute_keyframe_placement)]
-  pub fn compute_keyframe_placement(
+  pub fn compute_keyframe_placement<const BD: usize>(
     lookahead_frames: &[&Arc<Frame<T>>], keyframes_forced: &BTreeSet<u64>,
     keyframe_detector: &mut SceneChangeDetector<T>,
     next_lookahead_frame: &mut u64, keyframes: &mut BTreeSet<u64>,
   ) {
     if keyframes_forced.contains(next_lookahead_frame)
-      || keyframe_detector.analyze_next_frame(
+      || keyframe_detector.analyze_next_frame::<BD>(
         lookahead_frames,
         *next_lookahead_frame,
         *keyframes.iter().last().unwrap(),
@@ -882,24 +885,26 @@ impl<T: Pixel> ContextInner<T> {
   }
 
   #[hawktracer(compute_frame_invariants)]
-  pub fn compute_frame_invariants(&mut self) {
+  pub fn compute_frame_invariants<const BD: usize>(&mut self) {
     while self.set_frame_properties(self.next_lookahead_output_frameno).is_ok()
     {
-      self
-        .compute_lookahead_motion_vectors(self.next_lookahead_output_frameno);
+      self.compute_lookahead_motion_vectors::<BD>(
+        self.next_lookahead_output_frameno,
+      );
       if self.config.temporal_rdo() {
-        self.compute_lookahead_intra_costs(self.next_lookahead_output_frameno);
+        self.compute_lookahead_intra_costs::<BD>(
+          self.next_lookahead_output_frameno,
+        );
       }
       self.next_lookahead_output_frameno += 1;
     }
   }
 
   #[hawktracer(update_block_importances)]
-  fn update_block_importances(
+  fn update_block_importances<const BD: usize>(
     fi: &FrameInvariants<T>, me_stats: &crate::me::FrameMEStats,
-    frame: &Frame<T>, reference_frame: &Frame<T>, bit_depth: usize,
-    bsize: BlockSize, len: usize,
-    reference_frame_block_importances: &mut [f32],
+    frame: &Frame<T>, reference_frame: &Frame<T>, bsize: BlockSize,
+    len: usize, reference_frame_block_importances: &mut [f32],
   ) {
     let coded_data = fi.coded_frame_data.as_ref().unwrap();
     let plane_org = &frame.planes[0];
@@ -946,12 +951,11 @@ impl<T: Pixel> ContextInner<T> {
                 height: IMPORTANCE_BLOCK_SIZE,
               });
 
-              let inter_cost = get_satd(
+              let inter_cost = get_satd::<_, BD>(
                 &region_org,
                 &region_ref,
                 bsize.width(),
                 bsize.height(),
-                bit_depth,
                 fi.cpu_feature_level,
               ) as f32;
 
@@ -1058,7 +1062,7 @@ impl<T: Pixel> ContextInner<T> {
 
   /// Computes the block importances for the current output frame.
   #[hawktracer(compute_block_importances)]
-  fn compute_block_importances(&mut self) {
+  fn compute_block_importances<const BD: usize>(&mut self) {
     // SEF don't need block importances.
     if self.frame_data[&self.output_frameno]
       .as_ref()
@@ -1142,7 +1146,6 @@ impl<T: Pixel> ContextInner<T> {
           }
         }
 
-        let bit_depth = self.config.bit_depth;
         let frame_data = &mut self.frame_data;
         let len = unique_indices.len();
 
@@ -1178,12 +1181,11 @@ impl<T: Pixel> ContextInner<T> {
                 .block_importances
             })
           {
-            Self::update_block_importances(
+            Self::update_block_importances::<BD>(
               fi,
               me_stats,
               frame,
               reference_frame,
-              bit_depth,
               bsize,
               len,
               reference_frame_block_importances,
@@ -1244,7 +1246,7 @@ impl<T: Pixel> ContextInner<T> {
     }
   }
 
-  pub(crate) fn encode_packet(
+  pub(crate) fn encode_packet<const BD: usize>(
     &mut self, cur_output_frameno: u64,
   ) -> Result<Packet<T>, EncoderStatus> {
     if self
@@ -1325,10 +1327,9 @@ impl<T: Pixel> ContextInner<T> {
             self.frame_q[&frame_data.fi.input_frameno].as_ref().unwrap();
           coded_data.activity_mask =
             ActivityMask::from_plane(&frame.planes[0]);
-          coded_data.activity_mask.fill_scales(
-            frame_data.fi.sequence.bit_depth,
-            &mut coded_data.activity_scales,
-          );
+          coded_data
+            .activity_mask
+            .fill_scales::<BD>(&mut coded_data.activity_scales);
           log_isqrt_mean_scale = coded_data.compute_spatiotemporal_scores();
         } else {
           coded_data.activity_mask = ActivityMask::default();
@@ -1359,19 +1360,22 @@ impl<T: Pixel> ContextInner<T> {
       }
 
       let fti = frame_data.fi.get_frame_subtype();
-      let qps = self.rc_state.select_qi(
+      let qps = self.rc_state.select_qi::<_, BD>(
         self,
         cur_output_frameno,
         fti,
         self.maybe_prev_log_base_q,
         log_isqrt_mean_scale,
       );
-      frame_data.fi.set_quantizers(&qps);
+      frame_data.fi.set_quantizers::<BD>(&qps);
 
       if self.rc_state.needs_trial_encode(fti) {
         let mut trial_fs = frame_data.fs.clone();
-        let data =
-          encode_frame(&frame_data.fi, &mut trial_fs, &self.inter_cfg);
+        let data = encode_frame::<_, BD>(
+          &frame_data.fi,
+          &mut trial_fs,
+          &self.inter_cfg,
+        );
         self.rc_state.update_state(
           (data.len() * 8) as i64,
           fti,
@@ -1380,18 +1384,21 @@ impl<T: Pixel> ContextInner<T> {
           true,
           false,
         );
-        let qps = self.rc_state.select_qi(
+        let qps = self.rc_state.select_qi::<_, BD>(
           self,
           cur_output_frameno,
           fti,
           self.maybe_prev_log_base_q,
           log_isqrt_mean_scale,
         );
-        frame_data.fi.set_quantizers(&qps);
+        frame_data.fi.set_quantizers::<BD>(&qps);
       }
 
-      let data =
-        encode_frame(&frame_data.fi, &mut frame_data.fs, &self.inter_cfg);
+      let data = encode_frame::<_, BD>(
+        &frame_data.fi,
+        &mut frame_data.fs,
+        &self.inter_cfg,
+      );
       #[cfg(feature = "dump_lookahead_data")]
       {
         let input_frameno = frame_data.fi.input_frameno;
@@ -1488,7 +1495,9 @@ impl<T: Pixel> ContextInner<T> {
   }
 
   #[hawktracer(receive_packet)]
-  pub fn receive_packet(&mut self) -> Result<Packet<T>, EncoderStatus> {
+  pub fn receive_packet<const BD: usize>(
+    &mut self,
+  ) -> Result<Packet<T>, EncoderStatus> {
     if self.done_processing() {
       return Err(EncoderStatus::LimitReached);
     }
@@ -1514,12 +1523,12 @@ impl<T: Pixel> ContextInner<T> {
 
     if self.config.temporal_rdo() {
       // Compute the block importances for the current output frame.
-      self.compute_block_importances();
+      self.compute_block_importances::<BD>();
     }
 
     let cur_output_frameno = self.output_frameno;
 
-    let mut ret = self.encode_packet(cur_output_frameno);
+    let mut ret = self.encode_packet::<BD>(cur_output_frameno);
 
     if let Ok(ref mut pkt) = ret {
       self.garbage_collect(pkt.input_frameno);
diff --git a/src/api/lookahead.rs b/src/api/lookahead.rs
index 2758d5920b..81a495d7d7 100644
--- a/src/api/lookahead.rs
+++ b/src/api/lookahead.rs
@@ -27,8 +27,8 @@ pub(crate) const IMP_BLOCK_AREA_IN_MV_UNITS: i64 =
   IMP_BLOCK_SIZE_IN_MV_UNITS * IMP_BLOCK_SIZE_IN_MV_UNITS;
 
 #[hawktracer(estimate_intra_costs)]
-pub(crate) fn estimate_intra_costs<T: Pixel>(
-  temp_plane: &mut Plane<T>, frame: &Frame<T>, bit_depth: usize,
+pub(crate) fn estimate_intra_costs<T: Pixel, const BD: usize>(
+  temp_plane: &mut Plane<T>, frame: &Frame<T>,
   cpu_feature_level: CpuFeatureLevel,
 ) -> Box<[u32]> {
   let plane = &frame.planes[0];
@@ -54,7 +54,7 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
       });
 
       // TODO: other intra prediction modes.
-      let edge_buf = get_intra_edges(
+      let edge_buf = get_intra_edges::<_, BD>(
         &plane.as_region(),
         TileBlockOffset(BlockOffset { x, y }),
         0,
@@ -65,7 +65,6 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
           y: (y * IMPORTANCE_BLOCK_SIZE) as isize,
         },
         TxSize::TX_8X8,
-        bit_depth,
         Some(PredictionMode::DC_PRED),
         false,
         IntraParam::None,
@@ -79,7 +78,7 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
           height: IMPORTANCE_BLOCK_SIZE,
         });
 
-      PredictionMode::DC_PRED.predict_intra(
+      PredictionMode::DC_PRED.predict_intra::<_, BD>(
         TileRect {
           x: x * IMPORTANCE_BLOCK_SIZE,
           y: y * IMPORTANCE_BLOCK_SIZE,
@@ -88,7 +87,6 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
         },
         &mut plane_after_prediction_region,
         tx_size,
-        bit_depth,
         &[], // Not used by DC_PRED
         IntraParam::None,
         None, // Not used by DC_PRED
@@ -104,12 +102,11 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
           height: IMPORTANCE_BLOCK_SIZE,
         });
 
-      let intra_cost = get_satd(
+      let intra_cost = get_satd::<_, BD>(
         &plane_org,
         &plane_after_prediction_region,
         bsize.width(),
         bsize.height(),
-        bit_depth,
         cpu_feature_level,
       );
 
@@ -177,9 +174,9 @@ pub(crate) fn estimate_importance_block_difference<T: Pixel>(
 }
 
 #[hawktracer(estimate_inter_costs)]
-pub(crate) fn estimate_inter_costs<T: Pixel>(
-  frame: Arc<Frame<T>>, ref_frame: Arc<Frame<T>>, bit_depth: usize,
-  mut config: EncoderConfig, sequence: Arc<Sequence>, buffer: RefMEStats,
+pub(crate) fn estimate_inter_costs<T: Pixel, const BD: usize>(
+  frame: Arc<Frame<T>>, ref_frame: Arc<Frame<T>>, mut config: EncoderConfig,
+  sequence: Arc<Sequence>, buffer: RefMEStats,
 ) -> f64 {
   config.low_latency = true;
   config.speed_settings.multiref = false;
@@ -215,7 +212,7 @@ pub(crate) fn estimate_inter_costs<T: Pixel>(
       ],
     }),
   );
-  compute_motion_vectors(&mut fi, &mut fs, &inter_cfg);
+  compute_motion_vectors::<_, BD>(&mut fi, &mut fs, &inter_cfg);
 
   // Estimate inter costs
   let plane_org = &frame.planes[0];
@@ -252,12 +249,11 @@ pub(crate) fn estimate_inter_costs<T: Pixel>(
         height: IMPORTANCE_BLOCK_SIZE,
       });
 
-      inter_costs += get_satd(
+      inter_costs += get_satd::<_, BD>(
         &region_org,
         &region_ref,
         bsize.width(),
         bsize.height(),
-        bit_depth,
         fi.cpu_feature_level,
       ) as u64;
     });
@@ -266,7 +262,7 @@ pub(crate) fn estimate_inter_costs<T: Pixel>(
 }
 
 #[hawktracer(compute_motion_vectors)]
-pub(crate) fn compute_motion_vectors<T: Pixel>(
+pub(crate) fn compute_motion_vectors<T: Pixel, const BD: usize>(
   fi: &mut FrameInvariants<T>, fs: &mut FrameState<T>, inter_cfg: &InterConfig,
 ) {
   let mut blocks = FrameBlocks::new(fi.w_in_b, fi.h_in_b);
@@ -277,6 +273,6 @@ pub(crate) fn compute_motion_vectors<T: Pixel>(
     .into_par_iter()
     .for_each(|mut ctx| {
       let ts = &mut ctx.ts;
-      estimate_tile_motion(fi, ts, inter_cfg);
+      estimate_tile_motion::<_, BD>(fi, ts, inter_cfg);
     });
 }
diff --git a/src/asm/aarch64/cdef.rs b/src/asm/aarch64/cdef.rs
index 2fe70e1248..1c04e50f51 100644
--- a/src/asm/aarch64/cdef.rs
+++ b/src/asm/aarch64/cdef.rs
@@ -67,8 +67,8 @@ const fn decimate_index(xdec: usize, ydec: usize) -> usize {
 
 pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
   dst: &mut PlaneRegionMut<'_, T>, src: *const T, src_stride: isize,
-  pri_strength: i32, sec_strength: i32, dir: usize, damping: i32,
-  bit_depth: usize, xdec: usize, ydec: usize, edges: u8, cpu: CpuFeatureLevel,
+  pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, xdec: usize,
+  ydec: usize, edges: u8, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<T>| {
     rust::cdef_filter_block(
@@ -79,7 +79,6 @@ pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
       sec_strength,
       dir,
       damping,
-      bit_depth,
       xdec,
       ydec,
       edges,
diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs
index d34286bec7..94cb328702 100644
--- a/src/asm/shared/transform/inverse.rs
+++ b/src/asm/shared/transform/inverse.rs
@@ -17,11 +17,11 @@ pub type InvTxfmFunc =
 pub type InvTxfmHBDFunc =
   unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32);
 
-pub fn call_inverse_func<T: Pixel>(
+pub fn call_inverse_func<T: Pixel, const BD: usize>(
   func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>,
-  eob: usize, width: usize, height: usize, bd: usize,
+  eob: usize, width: usize, height: usize,
 ) {
-  debug_assert!(bd == 8);
+  debug_assert!(BD == 8);
 
   // Only use at most 32 columns and 32 rows of input coefficients.
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
@@ -51,7 +51,6 @@ pub fn call_inverse_func<T: Pixel>(
 pub fn call_inverse_hbd_func<T: Pixel>(
   func: InvTxfmHBDFunc, input: &[T::Coeff],
   output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize,
-  _bd: usize,
 ) {
   // Only use at most 32 columns and 32 rows of input coefficients.
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
@@ -161,35 +160,32 @@ pub mod test {
         *d = random::<u8>();
         *r = i16::from(*s) - i16::from(*d);
       }
-      forward_transform(
+      forward_transform::<_, 8>(
         res,
         freq,
         tx_size.width(),
         tx_size,
         tx_type,
-        8,
         CpuFeatureLevel::RUST,
       );
 
       let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h);
       let mut rust_dst = dst.clone();
 
-      inverse_transform_add(
+      inverse_transform_add::<_, 8>(
         freq,
         &mut dst.as_region_mut(),
         eob,
         tx_size,
         tx_type,
-        8,
         cpu,
       );
-      inverse_transform_add(
+      inverse_transform_add::<_, 8>(
         freq,
         &mut rust_dst.as_region_mut(),
         eob,
         tx_size,
         tx_type,
-        8,
         CpuFeatureLevel::RUST,
       );
       assert_eq!(rust_dst.data_origin(), dst.data_origin());
diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs
index 8892429052..1ab8be9099 100644
--- a/src/asm/x86/cdef.rs
+++ b/src/asm/x86/cdef.rs
@@ -41,13 +41,13 @@ const fn decimate_index(xdec: usize, ydec: usize) -> usize {
   ((ydec << 1) | xdec) & 3
 }
 
-pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
+pub(crate) unsafe fn cdef_filter_block<T: Pixel, const BD: usize>(
   dst: &mut PlaneRegionMut<'_, T>, src: *const T, src_stride: isize,
-  pri_strength: i32, sec_strength: i32, dir: usize, damping: i32,
-  bit_depth: usize, xdec: usize, ydec: usize, edges: u8, cpu: CpuFeatureLevel,
+  pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, xdec: usize,
+  ydec: usize, edges: u8, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<T>| {
-    rust::cdef_filter_block(
+    rust::cdef_filter_block::<_, _, BD>(
       dst,
       src,
       src_stride,
@@ -55,7 +55,6 @@ pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
       sec_strength,
       dir,
       damping,
-      bit_depth,
       xdec,
       ydec,
       edges,
@@ -124,7 +123,7 @@ pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
               sec_strength,
               dir as i32,
               damping,
-              (1 << bit_depth) - 1,
+              (1 << BD) - 1,
             );
           }
           None => call_rust(dst),
@@ -316,7 +315,6 @@ mod test {
             let pri_strength = 1;
             let sec_strength = 0;
             let damping = 2;
-            let bit_depth = 8;
 
             // SAFETY: Calling functions with raw pointers--we created the
             // planes above and only read from the start.
@@ -324,8 +322,8 @@ mod test {
             // FIXME: Remove `allow` once https://github.com/rust-lang/rust-clippy/issues/8264 fixed
             #[allow(clippy::undocumented_unsafe_blocks)]
             unsafe {
-              cdef_filter_block(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              cdef_filter_block(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST);
+              cdef_filter_block::<_, _, 8>(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping,  $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              cdef_filter_block::<_, _, 8>(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping,  $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST);
               assert_eq!(rust_dst.data_origin(), dst.data_origin());
             }
           }
diff --git a/src/asm/x86/dist/cdef_dist.rs b/src/asm/x86/dist/cdef_dist.rs
index 6b590d3730..4b2ab541b3 100644
--- a/src/asm/x86/dist/cdef_dist.rs
+++ b/src/asm/x86/dist/cdef_dist.rs
@@ -53,9 +53,9 @@ extern {
 ///
 /// - If in `check_asm` mode, panics on mismatch between native and ASM results.
 #[allow(clippy::let_and_return)]
-pub fn cdef_dist_kernel<T: Pixel>(
+pub fn cdef_dist_kernel<T: Pixel, const BD: usize>(
   src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
-  bit_depth: usize, cpu: CpuFeatureLevel,
+  cpu: CpuFeatureLevel,
 ) -> u32 {
   debug_assert!(src.plane_cfg.xdec == 0);
   debug_assert!(src.plane_cfg.ydec == 0);
@@ -67,7 +67,7 @@ pub fn cdef_dist_kernel<T: Pixel>(
   debug_assert!(h <= 8);
 
   let call_rust =
-    || -> u32 { rust::cdef_dist_kernel(dst, src, w, h, bit_depth, cpu) };
+    || -> u32 { rust::cdef_dist_kernel::<_, BD>(dst, src, w, h, cpu) };
   #[cfg(feature = "check_asm")]
   let ref_dist = call_rust();
 
@@ -112,7 +112,7 @@ pub fn cdef_dist_kernel<T: Pixel>(
     }
   };
 
-  let dist = apply_ssim_boost(sse, svar, dvar, bit_depth);
+  let dist = apply_ssim_boost::<BD>(sse, svar, dvar);
   #[cfg(feature = "check_asm")]
   assert_eq!(
     dist, ref_dist,
@@ -315,39 +315,39 @@ pub mod test {
 
   #[test]
   fn cdef_dist_simd_random() {
-    cdef_diff_tester(8, random_planes::<u8>);
+    cdef_diff_tester::<_, 8>(random_planes::<u8>);
   }
 
   #[test]
   fn cdef_dist_simd_random_hbd() {
-    cdef_diff_tester(10, random_planes::<u16>);
-    cdef_diff_tester(12, random_planes::<u16>);
+    cdef_diff_tester::<_, 10>(random_planes::<u16>);
+    cdef_diff_tester::<_, 12>(random_planes::<u16>);
   }
 
   #[test]
   fn cdef_dist_simd_large() {
-    cdef_diff_tester(8, max_planes::<u8>);
+    cdef_diff_tester::<_, 8>(max_planes::<u8>);
   }
 
   #[test]
   fn cdef_dist_simd_large_hbd() {
-    cdef_diff_tester(10, max_planes::<u16>);
-    cdef_diff_tester(12, max_planes::<u16>);
+    cdef_diff_tester::<_, 10>(max_planes::<u16>);
+    cdef_diff_tester::<_, 12>(max_planes::<u16>);
   }
 
   #[test]
   fn cdef_dist_simd_large_diff() {
-    cdef_diff_tester(8, max_diff_planes::<u8>);
+    cdef_diff_tester::<_, 8>(max_diff_planes::<u8>);
   }
 
   #[test]
   fn cdef_dist_simd_large_diff_hbd() {
-    cdef_diff_tester(10, max_diff_planes::<u16>);
-    cdef_diff_tester(12, max_diff_planes::<u16>);
+    cdef_diff_tester::<_, 10>(max_diff_planes::<u16, 10>);
+    cdef_diff_tester::<_, 12>(max_diff_planes::<u16>);
   }
 
-  fn cdef_diff_tester<T: Pixel>(
-    bd: usize, gen_planes: fn(bd: usize) -> (Plane<T>, Plane<T>),
+  fn cdef_diff_tester<T: Pixel, const BD: usize>(
+    gen_planes: fn(bd: usize) -> (Plane<T>, Plane<T>),
   ) {
     let (src_plane, dst_plane) = gen_planes(bd);
 
@@ -366,7 +366,6 @@ pub mod test {
           &dst_region,
           w,
           h,
-          bd,
           CpuFeatureLevel::default(),
         );
 
@@ -375,7 +374,6 @@ pub mod test {
           &dst_region,
           w,
           h,
-          bd,
           CpuFeatureLevel::default(),
         );
 
diff --git a/src/asm/x86/dist/mod.rs b/src/asm/x86/dist/mod.rs
index 676787adf3..9dc8207ff6 100644
--- a/src/asm/x86/dist/mod.rs
+++ b/src/asm/x86/dist/mod.rs
@@ -286,11 +286,11 @@ pub(crate) const fn to_index(bsize: BlockSize) -> usize {
 #[allow(clippy::let_and_return)]
 pub fn get_sad<T: Pixel>(
   src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
-  bit_depth: usize, cpu: CpuFeatureLevel,
+  cpu: CpuFeatureLevel,
 ) -> u32 {
   let bsize_opt = BlockSize::from_width_and_height_opt(w, h);
 
-  let call_rust = || -> u32 { rust::get_sad(dst, src, w, h, bit_depth, cpu) };
+  let call_rust = || -> u32 { rust::get_sad(dst, src, w, h, cpu) };
 
   #[cfg(feature = "check_asm")]
   let ref_dist = call_rust();
@@ -338,13 +338,13 @@ pub fn get_sad<T: Pixel>(
 /// - If in `check_asm` mode, panics on mismatch between native and ASM results.
 #[inline(always)]
 #[allow(clippy::let_and_return)]
-pub fn get_satd<T: Pixel>(
+pub fn get_satd<T: Pixel, const BD: usize>(
   src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
-  bit_depth: usize, cpu: CpuFeatureLevel,
+  cpu: CpuFeatureLevel,
 ) -> u32 {
   let bsize_opt = BlockSize::from_width_and_height_opt(w, h);
 
-  let call_rust = || -> u32 { rust::get_satd(dst, src, w, h, bit_depth, cpu) };
+  let call_rust = || -> u32 { rust::get_satd(dst, src, w, h, cpu) };
 
   #[cfg(feature = "check_asm")]
   let ref_dist = call_rust();
@@ -374,7 +374,7 @@ pub fn get_satd<T: Pixel>(
             T::to_asm_stride(src.plane_cfg.stride),
             dst.data_ptr() as *const _,
             T::to_asm_stride(dst.plane_cfg.stride),
-            (1 << bit_depth) - 1,
+            (1 << BD) - 1,
           )
         },
         None => call_rust(),
@@ -755,8 +755,8 @@ mod test {
                 *s = random::<u8>() as u16 * $BD / 8;
                 *d = random::<u8>() as u16 * $BD / 8;
               }
-              let result = [<get_ $DIST_TY>](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              let rust_result = [<get_ $DIST_TY>](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST);
+              let result = [<get_ $DIST_TY>]::<$BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = [<get_ $DIST_TY>]::<$BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
 
               assert_eq!(rust_result, result);
             } else {
@@ -768,8 +768,8 @@ mod test {
                 *s = random::<u8>();
                 *d = random::<u8>();
               }
-              let result = [<get_ $DIST_TY>](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              let rust_result = [<get_ $DIST_TY>](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST);
+              let result = [<get_ $DIST_TY>]::<$BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = [<get_ $DIST_TY>]::<$BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
 
               assert_eq!(rust_result, result);
             }
diff --git a/src/asm/x86/dist/sse.rs b/src/asm/x86/dist/sse.rs
index 08c710da11..dd4e6ef7de 100644
--- a/src/asm/x86/dist/sse.rs
+++ b/src/asm/x86/dist/sse.rs
@@ -92,8 +92,7 @@ declare_asm_hbd_sse_fn![
 #[allow(clippy::let_and_return)]
 pub fn get_weighted_sse<T: Pixel>(
   src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, scale: &[u32],
-  scale_stride: usize, w: usize, h: usize, bit_depth: usize,
-  cpu: CpuFeatureLevel,
+  scale_stride: usize, w: usize, h: usize, cpu: CpuFeatureLevel,
 ) -> u64 {
   // Assembly breaks if imp block size changes.
   assert_eq!(IMPORTANCE_BLOCK_SIZE >> 1, 4);
@@ -101,7 +100,7 @@ pub fn get_weighted_sse<T: Pixel>(
   let bsize_opt = BlockSize::from_width_and_height_opt(w, h);
 
   let call_rust = || -> u64 {
-    rust::get_weighted_sse(dst, src, scale, scale_stride, w, h, bit_depth, cpu)
+    rust::get_weighted_sse(dst, src, scale, scale_stride, w, h, cpu)
   };
 
   #[cfg(feature = "check_asm")]
@@ -381,7 +380,6 @@ pub mod test {
         SCALE_STRIDE,
         block.width(),
         block.height(),
-        bd,
         CpuFeatureLevel::default(),
       );
 
@@ -392,7 +390,6 @@ pub mod test {
         SCALE_STRIDE,
         block.width(),
         block.height(),
-        bd,
         CpuFeatureLevel::default(),
       );
 
diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs
index 2f5e0b3b8f..19af81e54b 100644
--- a/src/asm/x86/mc.rs
+++ b/src/asm/x86/mc.rs
@@ -91,15 +91,14 @@ const fn get_2d_mode_idx(mode_x: FilterMode, mode_y: FilterMode) -> usize {
 /// - If `width * height` is greater than the length of `tmp1` or `tmp2`
 /// - If `width` and `height` do not fit within the bounds of `src`
 #[inline(always)]
-pub fn put_8tap<T: Pixel>(
+pub fn put_8tap<T: Pixel, const BD: usize>(
   dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize,
   height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
-  mode_y: FilterMode, bit_depth: usize, cpu: CpuFeatureLevel,
+  mode_y: FilterMode, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<'_, T>| {
-    rust::put_8tap(
-      dst, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth,
-      cpu,
+    rust::put_8tap::<_, BD>(
+      dst, src, width, height, col_frac, row_frac, mode_x, mode_y, cpu,
     );
   };
   #[cfg(feature = "check_asm")]
@@ -149,7 +148,7 @@ pub fn put_8tap<T: Pixel>(
             height as i32,
             col_frac,
             row_frac,
-            (1 << bit_depth) - 1,
+            (1 << BD) - 1,
           ),
           None => call_rust(dst),
         }
@@ -176,15 +175,14 @@ pub fn put_8tap<T: Pixel>(
 /// - If `width * height` is greater than the length of `tmp1` or `tmp2`
 /// - If `width` and `height` do not fit within the bounds of `src`
 #[inline(always)]
-pub fn prep_8tap<T: Pixel>(
+pub fn prep_8tap<T: Pixel, const BD: usize>(
   tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize,
   col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
-  bit_depth: usize, cpu: CpuFeatureLevel,
+  cpu: CpuFeatureLevel,
 ) {
   let call_rust = |tmp: &mut [i16]| {
-    rust::prep_8tap(
-      tmp, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth,
-      cpu,
+    rust::prep_8tap::<_, BD>(
+      tmp, src, width, height, col_frac, row_frac, mode_x, mode_y, cpu,
     );
   };
   #[cfg(feature = "check_asm")]
@@ -223,7 +221,7 @@ pub fn prep_8tap<T: Pixel>(
           None => call_rust(tmp),
         }
       }
-      PixelType::U16 if bit_depth > 8 => {
+      PixelType::U16 if BD > 8 => {
         match PREP_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] {
           Some(func) => (func)(
             tmp.as_mut_ptr() as *mut _,
@@ -233,7 +231,7 @@ pub fn prep_8tap<T: Pixel>(
             height as i32,
             col_frac,
             row_frac,
-            (1 << bit_depth) - 1,
+            (1 << BD) - 1,
           ),
           None => call_rust(tmp),
         }
@@ -253,12 +251,12 @@ pub fn prep_8tap<T: Pixel>(
 /// - If `width` is not between 2 and 128
 /// - If `width * height` is greater than the length of `tmp1` or `tmp2`
 /// - If `width` and `height` do not fit within the bounds of `dst`
-pub fn mc_avg<T: Pixel>(
+pub fn mc_avg<T: Pixel, const BD: usize>(
   dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize,
-  height: usize, bit_depth: usize, cpu: CpuFeatureLevel,
+  height: usize, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<'_, T>| {
-    rust::mc_avg(dst, tmp1, tmp2, width, height, bit_depth, cpu);
+    rust::mc_avg::<_, BD>(dst, tmp1, tmp2, width, height, cpu);
   };
   #[cfg(feature = "check_asm")]
   let ref_dst = {
@@ -292,7 +290,7 @@ pub fn mc_avg<T: Pixel>(
         ),
         None => call_rust(dst),
       },
-      PixelType::U16 if bit_depth > 8 => match AVG_HBD_FNS[cpu.as_index()] {
+      PixelType::U16 if BD > 8 => match AVG_HBD_FNS[cpu.as_index()] {
         Some(func) => (func)(
           dst.data_ptr_mut() as *mut _,
           T::to_asm_stride(dst.plane_cfg.stride),
@@ -300,7 +298,7 @@ pub fn mc_avg<T: Pixel>(
           tmp2.as_ptr(),
           width as i32,
           height as i32,
-          (1 << bit_depth) - 1,
+          (1 << BD) - 1,
         ),
         None => call_rust(dst),
       },
@@ -652,8 +650,8 @@ mod test {
 
               for mv in &test_mvs {
                 let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv);
-                super::put_8tap(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-                super::put_8tap(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST);
+                super::put_8tap::<_, 8>(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+                super::put_8tap::<_, 8>(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST);
 
                 assert_eq!(&*dst1.data, &*dst2.data);
               }
@@ -670,8 +668,8 @@ mod test {
 
               for mv in &test_mvs {
                 let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv);
-                super::put_8tap(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-                super::put_8tap(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST);
+                super::put_8tap::<_, 8>(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+                super::put_8tap::<_, 8>(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST);
 
                 assert_eq!(&*dst1.data, &*dst2.data);
               }
@@ -740,8 +738,8 @@ mod test {
 
               for mv in &test_mvs {
                 let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv);
-                super::prep_8tap(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-                super::prep_8tap(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST);
+                super::prep_8tap::<_, 8>(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+                super::prep_8tap::<_, 8>(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST);
               }
             } else {
               // dynamic allocation: test
@@ -752,8 +750,8 @@ mod test {
 
               for mv in &test_mvs {
                 let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv);
-                super::prep_8tap(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-                super::prep_8tap(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST);
+                super::prep_8tap::<_, 8>(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+                super::prep_8tap::<_, 8>(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST);
               }
             };
 
diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs
index e001ae98ef..67cec03afe 100644
--- a/src/asm/x86/predict.rs
+++ b/src/asm/x86/predict.rs
@@ -194,16 +194,15 @@ decl_cfl_pred_hbd_fn! {
 }
 
 #[inline(always)]
-pub fn dispatch_predict_intra<T: Pixel>(
+pub fn dispatch_predict_intra<T: Pixel, const BD: usize>(
   mode: PredictionMode, variant: PredictionVariant,
-  dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize,
-  ac: &[i16], angle: isize, ief_params: Option<IntraEdgeFilterParameters>,
+  dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, ac: &[i16], angle: isize,
+  ief_params: Option<IntraEdgeFilterParameters>,
   edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<'_, T>| {
-    rust::dispatch_predict_intra(
-      mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf,
-      cpu,
+    rust::dispatch_predict_intra::<T, BD>(
+      mode, variant, dst, tx_size, ac, angle, ief_params, edge_buf, cpu,
     );
   };
 
@@ -362,11 +361,11 @@ pub fn dispatch_predict_intra<T: Pixel>(
           }
         }
       }
-      PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 && bit_depth > 8 => {
+      PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 && BD > 8 => {
         let dst_ptr = dst.data_ptr_mut() as *mut _;
         let edge_ptr =
           edge_buf.data.as_ptr().offset(2 * MAX_TX_SIZE as isize) as *const _;
-        let bd_max = (1 << bit_depth) - 1;
+        let bd_max = (1 << BD) - 1;
         match mode {
           PredictionMode::DC_PRED => {
             (match variant {
diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs
index 28dbabedc7..3228902391 100644
--- a/src/asm/x86/quantize.rs
+++ b/src/asm/x86/quantize.rs
@@ -24,7 +24,6 @@ type DequantizeFn = unsafe fn(
   _eob: usize,
   rcoeffs_ptr: *mut i16,
   tx_size: TxSize,
-  bit_depth: usize,
   dc_delta_q: i8,
   ac_delta_q: i8,
 );
@@ -32,18 +31,17 @@ type DequantizeFn = unsafe fn(
 cpu_function_lookup_table!(
   DEQUANTIZE_FNS: [Option<DequantizeFn>],
   default: None,
-  [(AVX2, Some(dequantize_avx2))]
+  [(AVX2, Some(dequantize_avx2_8bpc))]
 );
 
 #[inline(always)]
-pub fn dequantize<T: Coefficient>(
+pub fn dequantize<T: Coefficient, const BD: usize>(
   qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-  bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
+  dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |rcoeffs: &mut [T]| {
-    crate::quantize::rust::dequantize(
-      qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q,
-      ac_delta_q, cpu,
+    crate::quantize::rust::dequantize::<_, BD>(
+      qindex, coeffs, eob, rcoeffs, tx_size, dc_delta_q, ac_delta_q, cpu,
     );
   };
 
@@ -67,7 +65,6 @@ pub fn dequantize<T: Coefficient>(
             eob,
             rcoeffs.as_mut_ptr() as *mut _,
             tx_size,
-            bit_depth,
             dc_delta_q,
             ac_delta_q,
           )
@@ -87,18 +84,19 @@ pub fn dequantize<T: Coefficient>(
 }
 
 #[target_feature(enable = "avx2")]
-unsafe fn dequantize_avx2(
+unsafe fn dequantize_avx2_8bpc(
   qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16,
-  tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
+  tx_size: TxSize, dc_delta_q: i8, ac_delta_q: i8,
 ) {
+  const BD: usize = 8;
   let log_tx_scale = _mm256_set1_epi32(get_log_tx_scale(tx_size) as i32);
 
   let quants_ac =
-    _mm256_set1_epi32(ac_q(qindex, ac_delta_q, bit_depth).get() as i32);
+    _mm256_set1_epi32(ac_q::<BD>(qindex, ac_delta_q).get() as i32);
   // Use the dc quantize as first vector element for the first iteration
   let mut quants = _mm256_insert_epi32(
     quants_ac,
-    dc_q(qindex, dc_delta_q, bit_depth).get() as i32,
+    dc_q::<BD>(qindex, dc_delta_q).get() as i32,
     0,
   );
 
@@ -169,12 +167,10 @@ mod test {
       TX_8X32, TX_32X8, TX_16X64, TX_64X16,
     ];
 
-    let bd: usize = 8;
-
     for &tx_size in &tx_sizes {
       let qindex: u8 = rng.gen_range((MINQ as u8)..(MAXQ as u8));
-      let dc_quant = dc_q(qindex, 0, bd).get() as i16;
-      let ac_quant = ac_q(qindex, 0, bd).get() as i16;
+      let dc_quant = dc_q::<8>(qindex, 0).get() as i16;
+      let ac_quant = ac_q::<8>(qindex, 0).get() as i16;
 
       // Test the min, max, and random eobs
       let eobs = {
@@ -200,13 +196,12 @@ mod test {
         }
 
         // Rely on quantize's internal tests
-        dequantize(
+        dequantize::<_, 8>(
           qindex,
           &qcoeffs.data,
           eob,
           &mut rcoeffs.data,
           tx_size,
-          bd,
           0,
           0,
           CpuFeatureLevel::default(),
diff --git a/src/asm/x86/transform/forward.rs b/src/asm/x86/transform/forward.rs
index 18b1171517..d341915a0d 100644
--- a/src/asm/x86/transform/forward.rs
+++ b/src/asm/x86/transform/forward.rs
@@ -332,9 +332,9 @@ fn cast_mut<const N: usize, T>(x: &mut [T]) -> &mut [T; N] {
 
 #[allow(clippy::identity_op, clippy::erasing_op)]
 #[target_feature(enable = "avx2")]
-unsafe fn forward_transform_avx2<T: Coefficient>(
+unsafe fn forward_transform_avx2<T: Coefficient, const BD: usize>(
   input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-  tx_type: TxType, bd: usize,
+  tx_type: TxType,
 ) {
   // Note when assigning txfm_size_col, we use the txfm_size from the
   // row configuration and vice versa. This is intentionally done to
@@ -350,7 +350,7 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
 
   let mut tmp: Aligned<[I32X8; 64 * 64 / 8]> = Aligned::uninitialized();
   let buf = &mut tmp.data[..txfm_size_col * (txfm_size_row / 8).max(1)];
-  let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd);
+  let cfg = Txfm2DFlipCfg::fwd::<BD>(tx_type, tx_size);
 
   let txfm_func_col = get_func_i32x8(cfg.txfm_type_col);
   let txfm_func_row = get_func_i32x8(cfg.txfm_type_row);
@@ -507,18 +507,20 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
 /// # Panics
 ///
 /// - If called with an invalid combination of `tx_size` and `tx_type`
-pub fn forward_transform<T: Coefficient>(
+pub fn forward_transform<T: Coefficient, const BD: usize>(
   input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-  tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
+  tx_type: TxType, cpu: CpuFeatureLevel,
 ) {
   assert!(valid_av1_transform(tx_size, tx_type));
   if cpu >= CpuFeatureLevel::AVX2 {
     // SAFETY: Calls Assembly code.
     unsafe {
-      forward_transform_avx2(input, output, stride, tx_size, tx_type, bd);
+      forward_transform_avx2::<_, BD>(input, output, stride, tx_size, tx_type);
     }
   } else {
-    rust::forward_transform(input, output, stride, tx_size, tx_type, bd, cpu);
+    rust::forward_transform::<_, BD>(
+      input, output, stride, tx_size, tx_type, cpu,
+    );
   }
 }
 
@@ -562,22 +564,20 @@ mod test {
         let mut output_simd = vec![0i16; area];
 
         println!("Testing combination {:?}, {:?}", tx_size, tx_type);
-        forward_transform(
+        forward_transform::<_, 8>(
           &input[..],
           &mut output_ref[..],
           tx_size.width(),
           tx_size,
           tx_type,
-          8,
           CpuFeatureLevel::RUST,
         );
-        forward_transform(
+        forward_transform::<_, 8>(
           &input[..],
           &mut output_simd[..],
           tx_size.width(),
           tx_size,
           tx_type,
-          8,
           cpu,
         );
         assert_eq!(output_ref, output_simd)
diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs
index 027cdf19b7..84b728d5e0 100644
--- a/src/asm/x86/transform/inverse.rs
+++ b/src/asm/x86/transform/inverse.rs
@@ -16,27 +16,26 @@ use crate::{Pixel, PixelType};
 use crate::asm::shared::transform::inverse::*;
 use crate::asm::shared::transform::*;
 
-pub fn inverse_transform_add<T: Pixel>(
+pub fn inverse_transform_add<T: Pixel, const BD: usize>(
   input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
-  tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
+  tx_size: TxSize, tx_type: TxType, cpu: CpuFeatureLevel,
 ) {
   match T::type_enum() {
     PixelType::U8 => {
       if let Some(func) = INV_TXFM_FNS[cpu.as_index()]
         [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)]
       {
-        return call_inverse_func(
+        return call_inverse_func::<_, BD>(
           func,
           input,
           output,
           eob,
           tx_size.width(),
           tx_size.height(),
-          bd,
         );
       }
     }
-    PixelType::U16 if bd == 10 => {
+    PixelType::U16 if BD == 10 => {
       if let Some(func) = INV_TXFM_HBD_FNS_10[cpu.as_index()]
         [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)]
       {
@@ -47,11 +46,10 @@ pub fn inverse_transform_add<T: Pixel>(
           eob,
           tx_size.width(),
           tx_size.height(),
-          bd,
         );
       }
     }
-    PixelType::U16 => {
+    PixelType::U16 if BD == 12 => {
       if let Some(func) = INV_TXFM_HBD_FNS_12[cpu.as_index()]
         [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)]
       {
@@ -62,13 +60,15 @@ pub fn inverse_transform_add<T: Pixel>(
           eob,
           tx_size.width(),
           tx_size.height(),
-          bd,
         );
       }
     }
+    _ => unimplemented!(),
   };
 
-  rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu);
+  rust::inverse_transform_add::<_, BD>(
+    input, output, eob, tx_size, tx_type, cpu,
+  );
 }
 
 macro_rules! decl_itx_fns {
diff --git a/src/cdef.rs b/src/cdef.rs
index 863399036f..71fc1ae2d1 100644
--- a/src/cdef.rs
+++ b/src/cdef.rs
@@ -196,11 +196,14 @@ pub(crate) mod rust {
 
   #[cold_for_target_arch("x86_64")]
   #[allow(clippy::erasing_op, clippy::identity_op, clippy::neg_multiply)]
-  pub(crate) unsafe fn cdef_filter_block<T: Pixel, U: Pixel>(
+  pub(crate) unsafe fn cdef_filter_block<
+    T: Pixel,
+    U: Pixel,
+    const BD: usize,
+  >(
     dst: &mut PlaneRegionMut<'_, T>, input: *const U, istride: isize,
     pri_strength: i32, sec_strength: i32, dir: usize, damping: i32,
-    bit_depth: usize, xdec: usize, ydec: usize, edges: u8,
-    _cpu: CpuFeatureLevel,
+    xdec: usize, ydec: usize, edges: u8, _cpu: CpuFeatureLevel,
   ) {
     if edges != CDEF_HAVE_ALL {
       // slowpath for unpadded border[s]
@@ -216,7 +219,7 @@ pub(crate) mod rust {
         8 >> ydec,
         edges,
       );
-      cdef_filter_block(
+      cdef_filter_block::<_, _, BD>(
         dst,
         tmp.as_ptr().offset(2 * tmpstride + 2),
         tmpstride,
@@ -224,7 +227,6 @@ pub(crate) mod rust {
         sec_strength,
         dir,
         damping,
-        bit_depth,
         xdec,
         ydec,
         CDEF_HAVE_ALL,
@@ -233,7 +235,7 @@ pub(crate) mod rust {
     } else {
       let xsize = (8 >> xdec) as isize;
       let ysize = (8 >> ydec) as isize;
-      let coeff_shift = bit_depth - 8;
+      let coeff_shift = BD - 8;
       let cdef_pri_taps = [[4, 2], [3, 3]];
       let cdef_sec_taps = [[2, 1], [2, 1]];
       let pri_taps =
@@ -322,7 +324,7 @@ fn adjust_strength(strength: i32, var: i32) -> i32 {
   }
 }
 
-pub fn cdef_analyze_superblock_range<T: Pixel>(
+pub fn cdef_analyze_superblock_range<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, in_frame: &Frame<T>, blocks: &TileBlocks<'_>,
   sb_w: usize, sb_h: usize,
 ) -> Vec<CdefDirections> {
@@ -330,17 +332,17 @@ pub fn cdef_analyze_superblock_range<T: Pixel>(
   for sby in 0..sb_h {
     for sbx in 0..sb_w {
       let sbo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby });
-      ret.push(cdef_analyze_superblock(fi, in_frame, blocks, sbo));
+      ret.push(cdef_analyze_superblock::<_, BD>(fi, in_frame, blocks, sbo));
     }
   }
   ret
 }
 
-pub fn cdef_analyze_superblock<T: Pixel>(
+pub fn cdef_analyze_superblock<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, in_frame: &Frame<T>, blocks: &TileBlocks<'_>,
   sbo: TileSuperBlockOffset,
 ) -> CdefDirections {
-  let coeff_shift = fi.sequence.bit_depth - 8;
+  let coeff_shift = BD - 8;
   let mut dir: CdefDirections =
     CdefDirections { dir: [[0; 8]; 8], var: [[0; 8]; 8] };
   // Each direction block is 8x8 in y, and direction computation only looks at y
@@ -396,13 +398,12 @@ pub fn cdef_analyze_superblock<T: Pixel>(
 /// # Panics
 ///
 /// - If called with invalid parameters
-pub fn cdef_filter_superblock<T: Pixel>(
+pub fn cdef_filter_superblock<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, input: &Frame<T>, output: &mut TileMut<'_, T>,
   blocks: &TileBlocks<'_>, tile_sbo: TileSuperBlockOffset, cdef_index: u8,
   cdef_dirs: &CdefDirections,
 ) {
-  let bit_depth = fi.sequence.bit_depth;
-  let coeff_shift = fi.sequence.bit_depth as i32 - 8;
+  let coeff_shift = BD as i32 - 8;
   let cdef_damping = fi.cdef_damping as i32;
   let cdef_y_strength = fi.cdef_y_strengths[cdef_index as usize];
   let cdef_uv_strength = fi.cdef_uv_strengths[cdef_index as usize];
@@ -536,7 +537,7 @@ pub fn cdef_filter_superblock<T: Pixel>(
                 0 <= in_po.y - if edges & CDEF_HAVE_TOP > 0 { 2 } else { 0 }
               );
 
-              cdef_filter_block(
+              cdef_filter_block::<_, BD>(
                 out_block,
                 in_slice.as_ptr(),
                 in_stride as isize,
@@ -544,7 +545,6 @@ pub fn cdef_filter_superblock<T: Pixel>(
                 local_sec_strength,
                 local_dir,
                 local_damping,
-                bit_depth,
                 xdec,
                 ydec,
                 edges,
@@ -592,7 +592,7 @@ pub fn cdef_filter_superblock<T: Pixel>(
 //   don't exist.
 
 #[hawktracer(cdef_filter_tile)]
-pub fn cdef_filter_tile<T: Pixel>(
+pub fn cdef_filter_tile<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, input: &Frame<T>, tb: &TileBlocks,
   output: &mut TileMut<'_, T>,
 ) {
@@ -613,9 +613,10 @@ pub fn cdef_filter_tile<T: Pixel>(
       // the input Frame.
       let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: fbx, y: fby });
       let cdef_index = tb.get_cdef(tile_sbo);
-      let cdef_dirs = cdef_analyze_superblock(fi, input, tb, tile_sbo);
+      let cdef_dirs =
+        cdef_analyze_superblock::<_, BD>(fi, input, tb, tile_sbo);
 
-      cdef_filter_superblock(
+      cdef_filter_superblock::<_, BD>(
         fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs,
       );
     }
diff --git a/src/deblock.rs b/src/deblock.rs
index 21a4bf19aa..43afbfb468 100644
--- a/src/deblock.rs
+++ b/src/deblock.rs
@@ -146,9 +146,10 @@ fn deblock_level(
 
 // four taps, 4 outputs (two are trivial)
 #[inline]
-fn filter_narrow2_4(
-  p1: i32, p0: i32, q0: i32, q1: i32, shift: usize,
+fn filter_narrow2_4<const BD: usize>(
+  p1: i32, p0: i32, q0: i32, q1: i32,
 ) -> [i32; 4] {
+  let shift = BD - 8;
   let filter0 = clamp(p1 - q1, -128 << shift, (128 << shift) - 1);
   let filter1 =
     clamp(filter0 + 3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3;
@@ -178,28 +179,29 @@ fn filter_narrow2_4(
 
 // six taps, 6 outputs (four are trivial)
 #[inline]
-fn filter_narrow2_6(
-  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize,
+fn filter_narrow2_6<const BD: usize>(
+  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32,
 ) -> [i32; 6] {
-  let x = filter_narrow2_4(p1, p0, q0, q1, shift);
+  let x = filter_narrow2_4::<BD>(p1, p0, q0, q1);
   [p2, x[0], x[1], x[2], x[3], q2]
 }
 
 // 12 taps, 12 outputs (ten are trivial)
 #[inline]
-fn filter_narrow2_12(
+fn filter_narrow2_12<const BD: usize>(
   p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32,
-  q2: i32, q3: i32, q4: i32, q5: i32, shift: usize,
+  q2: i32, q3: i32, q4: i32, q5: i32,
 ) -> [i32; 12] {
-  let x = filter_narrow2_4(p1, p0, q0, q1, shift);
+  let x = filter_narrow2_4::<BD>(p1, p0, q0, q1);
   [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5]
 }
 
 // four taps, 4 outputs
 #[inline]
-fn filter_narrow4_4(
-  p1: i32, p0: i32, q0: i32, q1: i32, shift: usize,
+fn filter_narrow4_4<const BD: usize>(
+  p1: i32, p0: i32, q0: i32, q1: i32,
 ) -> [i32; 4] {
+  let shift = BD - 8;
   let filter1 =
     clamp(3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3;
   // be certain our optimization removing a clamp is sound
@@ -227,20 +229,20 @@ fn filter_narrow4_4(
 
 // six taps, 6 outputs (two are trivial)
 #[inline]
-fn filter_narrow4_6(
-  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize,
+fn filter_narrow4_6<const BD: usize>(
+  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32,
 ) -> [i32; 6] {
-  let x = filter_narrow4_4(p1, p0, q0, q1, shift);
+  let x = filter_narrow4_4::<BD>(p1, p0, q0, q1);
   [p2, x[0], x[1], x[2], x[3], q2]
 }
 
 // 12 taps, 12 outputs (eight are trivial)
 #[inline]
-fn filter_narrow4_12(
+fn filter_narrow4_12<const BD: usize>(
   p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32,
-  q2: i32, q3: i32, q4: i32, q5: i32, shift: usize,
+  q2: i32, q3: i32, q4: i32, q5: i32,
 ) -> [i32; 12] {
-  let x = filter_narrow4_4(p1, p0, q0, q1, shift);
+  let x = filter_narrow4_4::<BD>(p1, p0, q0, q1);
   [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5]
 }
 
@@ -333,57 +335,63 @@ fn stride_sse<const LEN: usize>(a: &[i32; LEN], b: &[i32; LEN]) -> i64 {
 }
 
 #[inline]
-const fn _level_to_limit(level: i32, shift: usize) -> i32 {
+const fn _level_to_limit<const BD: usize>(level: i32) -> i32 {
+  let shift = BD - 8;
   level << shift
 }
 
 #[inline]
-const fn limit_to_level(limit: i32, shift: usize) -> i32 {
+const fn limit_to_level<const BD: usize>(limit: i32) -> i32 {
+  let shift = BD - 8;
   (limit + (1 << shift) - 1) >> shift
 }
 
 #[inline]
-const fn _level_to_blimit(level: i32, shift: usize) -> i32 {
+const fn _level_to_blimit<const BD: usize>(level: i32) -> i32 {
+  let shift = BD - 8;
   (3 * level + 4) << shift
 }
 
 #[inline]
-const fn blimit_to_level(blimit: i32, shift: usize) -> i32 {
+const fn blimit_to_level<const BD: usize>(blimit: i32) -> i32 {
+  let shift = BD - 8;
   (((blimit + (1 << shift) - 1) >> shift) - 2) / 3
 }
 
 #[inline]
-const fn _level_to_thresh(level: i32, shift: usize) -> i32 {
+const fn _level_to_thresh<const BD: usize>(level: i32) -> i32 {
+  let shift = BD - 8;
   level >> 4 << shift
 }
 
 #[inline]
-const fn thresh_to_level(thresh: i32, shift: usize) -> i32 {
+const fn thresh_to_level<const BD: usize>(thresh: i32) -> i32 {
+  let shift = BD - 8;
   (thresh + (1 << shift) - 1) >> shift << 4
 }
 
 #[inline]
-fn nhev4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize {
-  thresh_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift) as usize
+fn nhev4<const BD: usize>(p1: i32, p0: i32, q0: i32, q1: i32) -> usize {
+  thresh_to_level::<BD>(cmp::max((p1 - p0).abs(), (q1 - q0).abs())) as usize
 }
 
 #[inline]
-fn mask4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize {
+fn mask4<const BD: usize>(p1: i32, p0: i32, q0: i32, q1: i32) -> usize {
   cmp::max(
-    limit_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift),
-    blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift),
+    limit_to_level::<BD>(cmp::max((p1 - p0).abs(), (q1 - q0).abs())),
+    blimit_to_level::<BD>((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2),
   ) as usize
 }
 
 #[inline]
-fn deblock_size4_inner(
-  [p1, p0, q0, q1]: [i32; 4], level: usize, bd: usize,
+fn deblock_size4_inner<const BD: usize>(
+  [p1, p0, q0, q1]: [i32; 4], level: usize,
 ) -> Option<[i32; 4]> {
-  if mask4(p1, p0, q0, q1, bd - 8) <= level {
-    let x = if nhev4(p1, p0, q0, q1, bd - 8) <= level {
-      filter_narrow4_4(p1, p0, q0, q1, bd - 8)
+  if mask4::<BD>(p1, p0, q0, q1) <= level {
+    let x = if nhev4::<BD>(p1, p0, q0, q1) <= level {
+      filter_narrow4_4::<BD>(p1, p0, q0, q1)
     } else {
-      filter_narrow2_4(p1, p0, q0, q1, bd - 8)
+      filter_narrow2_4::<BD>(p1, p0, q0, q1)
     };
     Some(x)
   } else {
@@ -392,26 +400,26 @@ fn deblock_size4_inner(
 }
 
 // Assumes rec[0] is set 2 taps back from the edge
-fn deblock_v_size4<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_v_size4<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for y in 0..4 {
     let p = &rec[y];
     let vals = [p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_()];
-    if let Some(data) = deblock_size4_inner(vals, level, bd) {
+    if let Some(data) = deblock_size4_inner::<BD>(vals, level) {
       copy_horizontal(rec, 0, y, &data);
     }
   }
 }
 
 // Assumes rec[0] is set 2 taps back from the edge
-fn deblock_h_size4<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_h_size4<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for x in 0..4 {
     let vals =
       [rec[0][x].as_(), rec[1][x].as_(), rec[2][x].as_(), rec[3][x].as_()];
-    if let Some(data) = deblock_size4_inner(vals, level, bd) {
+    if let Some(data) = deblock_size4_inner::<BD>(vals, level) {
       copy_vertical(rec, x, 0, &data);
     }
   }
@@ -419,9 +427,9 @@ fn deblock_h_size4<T: Pixel>(
 
 // Assumes rec[0] and src[0] are set 2 taps back from the edge.
 // Accesses four taps, accumulates four pixels into the tally
-fn sse_size4<T: Pixel>(
+fn sse_size4<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>,
-  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize,
+  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool,
 ) {
   for i in 0..4 {
     let (p1, p0, q0, q1, a) = if horizontal_p {
@@ -445,13 +453,13 @@ fn sse_size4<T: Pixel>(
     // three possibilities: no filter, narrow2 and narrow4
     // All possibilities produce four outputs
     let none: [_; 4] = [p1, p0, q0, q1];
-    let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8);
-    let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8);
+    let narrow2 = filter_narrow2_4::<BD>(p1, p0, q0, q1);
+    let narrow4 = filter_narrow4_4::<BD>(p1, p0, q0, q1);
 
     // mask4 sets the dividing line for filter vs no filter
     // nhev4 sets the dividing line between narrow2 and narrow4
-    let mask = clamp(mask4(p1, p0, q0, q1, bd - 8), 1, MAX_LOOP_FILTER + 1);
-    let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1);
+    let mask = clamp(mask4::<BD>(p1, p0, q0, q1), 1, MAX_LOOP_FILTER + 1);
+    let nhev = clamp(nhev4::<BD>(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1);
 
     // sse for each; short-circuit the 'special' no-op cases.
     let sse_none = stride_sse(&a, &none);
@@ -474,18 +482,15 @@ fn sse_size4<T: Pixel>(
 }
 
 #[inline]
-fn mask6(
-  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize,
+fn mask6<const BD: usize>(
+  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32,
 ) -> usize {
   cmp::max(
-    limit_to_level(
-      cmp::max(
-        (p2 - p1).abs(),
-        cmp::max((p1 - p0).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs())),
-      ),
-      shift,
-    ),
-    blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift),
+    limit_to_level::<BD>(cmp::max(
+      (p2 - p1).abs(),
+      cmp::max((p1 - p0).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs())),
+    )),
+    blimit_to_level::<BD>((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2),
   ) as usize
 }
 
@@ -498,17 +503,17 @@ fn flat6(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32) -> usize {
 }
 
 #[inline]
-fn deblock_size6_inner(
-  [p2, p1, p0, q0, q1, q2]: [i32; 6], level: usize, bd: usize,
+fn deblock_size6_inner<const BD: usize>(
+  [p2, p1, p0, q0, q1, q2]: [i32; 6], level: usize,
 ) -> Option<[i32; 4]> {
-  if mask6(p2, p1, p0, q0, q1, q2, bd - 8) <= level {
-    let flat = 1 << (bd - 8);
+  if mask6::<BD>(p2, p1, p0, q0, q1, q2) <= level {
+    let flat = 1 << (BD - 8);
     let x = if flat6(p2, p1, p0, q0, q1, q2) <= flat {
       filter_wide6_4(p2, p1, p0, q0, q1, q2)
-    } else if nhev4(p1, p0, q0, q1, bd - 8) <= level {
-      filter_narrow4_4(p1, p0, q0, q1, bd - 8)
+    } else if nhev4::<BD>(p1, p0, q0, q1) <= level {
+      filter_narrow4_4::<BD>(p1, p0, q0, q1)
     } else {
-      filter_narrow2_4(p1, p0, q0, q1, bd - 8)
+      filter_narrow2_4::<BD>(p1, p0, q0, q1)
     };
     Some(x)
   } else {
@@ -517,22 +522,22 @@ fn deblock_size6_inner(
 }
 
 // Assumes slice[0] is set 3 taps back from the edge
-fn deblock_v_size6<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_v_size6<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for y in 0..4 {
     let p = &rec[y];
     let vals =
       [p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_(), p[4].as_(), p[5].as_()];
-    if let Some(data) = deblock_size6_inner(vals, level, bd) {
+    if let Some(data) = deblock_size6_inner::<BD>(vals, level) {
       copy_horizontal(rec, 1, y, &data);
     }
   }
 }
 
 // Assumes slice[0] is set 3 taps back from the edge
-fn deblock_h_size6<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_h_size6<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for x in 0..4 {
     let vals = [
@@ -543,7 +548,7 @@ fn deblock_h_size6<T: Pixel>(
       rec[4][x].as_(),
       rec[5][x].as_(),
     ];
-    if let Some(data) = deblock_size6_inner(vals, level, bd) {
+    if let Some(data) = deblock_size6_inner::<BD>(vals, level) {
       copy_vertical(rec, x, 1, &data);
     }
   }
@@ -551,11 +556,11 @@ fn deblock_h_size6<T: Pixel>(
 
 // Assumes rec[0] and src[0] are set 3 taps back from the edge.
 // Accesses six taps, accumulates four pixels into the tally
-fn sse_size6<T: Pixel>(
+fn sse_size6<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>,
-  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize,
+  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool,
 ) {
-  let flat = 1 << (bd - 8);
+  let flat = 1 << (BD - 8);
   for i in 0..4 {
     let (p2, p1, p0, q0, q1, q2, a) = if horizontal_p {
       // six taps
@@ -587,16 +592,16 @@ fn sse_size6<T: Pixel>(
     // All possibilities produce four outputs
     let none: [_; 4] = [p1, p0, q0, q1];
     let wide6 = filter_wide6_4(p2, p1, p0, q0, q1, q2);
-    let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8);
-    let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8);
+    let narrow2 = filter_narrow2_4::<BD>(p1, p0, q0, q1);
+    let narrow4 = filter_narrow4_4::<BD>(p1, p0, q0, q1);
 
     // mask6 sets the dividing line for filter vs no filter
     // flat6 decides between wide and narrow filters (unrelated to level)
     // nhev4 sets the dividing line between narrow2 and narrow4
     let mask =
-      clamp(mask6(p2, p1, p0, q0, q1, q2, bd - 8), 1, MAX_LOOP_FILTER + 1);
+      clamp(mask6::<BD>(p2, p1, p0, q0, q1, q2), 1, MAX_LOOP_FILTER + 1);
     let flatp = flat6(p2, p1, p0, q0, q1, q2) <= flat;
-    let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1);
+    let nhev = clamp(nhev4::<BD>(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1);
 
     // sse for each; short-circuit the 'special' no-op cases.
     let sse_none = stride_sse(&a, &none);
@@ -627,28 +632,24 @@ fn sse_size6<T: Pixel>(
 }
 
 #[inline]
-fn mask8(
+fn mask8<const BD: usize>(
   p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32,
-  shift: usize,
 ) -> usize {
   cmp::max(
-    limit_to_level(
+    limit_to_level::<BD>(cmp::max(
+      (p3 - p2).abs(),
       cmp::max(
-        (p3 - p2).abs(),
+        (p2 - p1).abs(),
         cmp::max(
-          (p2 - p1).abs(),
+          (p1 - p0).abs(),
           cmp::max(
-            (p1 - p0).abs(),
-            cmp::max(
-              (q3 - q2).abs(),
-              cmp::max((q2 - q1).abs(), (q1 - q0).abs()),
-            ),
+            (q3 - q2).abs(),
+            cmp::max((q2 - q1).abs(), (q1 - q0).abs()),
           ),
         ),
       ),
-      shift,
-    ),
-    blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift),
+    )),
+    blimit_to_level::<BD>((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2),
   ) as usize
 }
 
@@ -669,17 +670,17 @@ fn flat8(
 }
 
 #[inline]
-fn deblock_size8_inner(
-  [p3, p2, p1, p0, q0, q1, q2, q3]: [i32; 8], level: usize, bd: usize,
+fn deblock_size8_inner<const BD: usize>(
+  [p3, p2, p1, p0, q0, q1, q2, q3]: [i32; 8], level: usize,
 ) -> Option<[i32; 6]> {
-  if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level {
-    let flat = 1 << (bd - 8);
+  if mask8::<BD>(p3, p2, p1, p0, q0, q1, q2, q3) <= level {
+    let flat = 1 << (BD - 8);
     let x = if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat {
       filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3)
-    } else if nhev4(p1, p0, q0, q1, bd - 8) <= level {
-      filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8)
+    } else if nhev4::<BD>(p1, p0, q0, q1) <= level {
+      filter_narrow4_6::<BD>(p2, p1, p0, q0, q1, q2)
     } else {
-      filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8)
+      filter_narrow2_6::<BD>(p2, p1, p0, q0, q1, q2)
     };
     Some(x)
   } else {
@@ -688,8 +689,8 @@ fn deblock_size8_inner(
 }
 
 // Assumes rec[0] is set 4 taps back from the edge
-fn deblock_v_size8<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_v_size8<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for y in 0..4 {
     let p = &rec[y];
@@ -703,15 +704,15 @@ fn deblock_v_size8<T: Pixel>(
       p[6].as_(),
       p[7].as_(),
     ];
-    if let Some(data) = deblock_size8_inner(vals, level, bd) {
+    if let Some(data) = deblock_size8_inner::<BD>(vals, level) {
       copy_horizontal(rec, 1, y, &data);
     }
   }
 }
 
 // Assumes rec[0] is set 4 taps back from the edge
-fn deblock_h_size8<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_h_size8<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for x in 0..4 {
     let vals = [
@@ -724,7 +725,7 @@ fn deblock_h_size8<T: Pixel>(
       rec[6][x].as_(),
       rec[7][x].as_(),
     ];
-    if let Some(data) = deblock_size8_inner(vals, level, bd) {
+    if let Some(data) = deblock_size8_inner::<BD>(vals, level) {
       copy_vertical(rec, x, 1, &data);
     }
   }
@@ -732,11 +733,11 @@ fn deblock_h_size8<T: Pixel>(
 
 // Assumes rec[0] and src[0] are set 4 taps back from the edge.
 // Accesses eight taps, accumulates six pixels into the tally
-fn sse_size8<T: Pixel>(
+fn sse_size8<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>,
-  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize,
+  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool,
 ) {
-  let flat = 1 << (bd - 8);
+  let flat = 1 << (BD - 8);
 
   for i in 0..4 {
     let (p3, p2, p1, p0, q0, q1, q2, q3, a) = if horizontal_p {
@@ -786,19 +787,19 @@ fn sse_size8<T: Pixel>(
     // Four possibilities: no filter, wide8, narrow2 and narrow4
     let none: [_; 6] = [p2, p1, p0, q0, q1, q2];
     let wide8: [_; 6] = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3);
-    let narrow2: [_; 6] = filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8);
-    let narrow4: [_; 6] = filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8);
+    let narrow2: [_; 6] = filter_narrow2_6::<BD>(p2, p1, p0, q0, q1, q2);
+    let narrow4: [_; 6] = filter_narrow4_6::<BD>(p2, p1, p0, q0, q1, q2);
 
     // mask8 sets the dividing line for filter vs no filter
     // flat8 decides between wide and narrow filters (unrelated to level)
     // nhev4 sets the dividing line between narrow2 and narrow4
     let mask = clamp(
-      mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8),
+      mask8::<BD>(p3, p2, p1, p0, q0, q1, q2, q3),
       1,
       MAX_LOOP_FILTER + 1,
     );
     let flatp = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat;
-    let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1);
+    let nhev = clamp(nhev4::<BD>(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1);
 
     // sse for each; short-circuit the 'special' no-op cases.
     let sse_none = stride_sse(&a, &none);
@@ -845,13 +846,13 @@ fn flat14_outer(
 }
 
 #[inline]
-fn deblock_size14_inner(
+fn deblock_size14_inner<const BD: usize>(
   [p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6]: [i32; 14],
-  level: usize, bd: usize,
+  level: usize,
 ) -> Option<[i32; 12]> {
   // 'mask' test
-  if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level {
-    let flat = 1 << (bd - 8);
+  if mask8::<BD>(p3, p2, p1, p0, q0, q1, q2, q3) <= level {
+    let flat = 1 << (BD - 8);
     // inner flatness test
     let x = if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat {
       // outer flatness test
@@ -864,11 +865,11 @@ fn deblock_size14_inner(
         // only flat in inner area, run 8-tap
         filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5)
       }
-    } else if nhev4(p1, p0, q0, q1, bd - 8) <= level {
+    } else if nhev4::<BD>(p1, p0, q0, q1) <= level {
       // not flat, run narrow filter
-      filter_narrow4_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8)
+      filter_narrow4_12::<BD>(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5)
     } else {
-      filter_narrow2_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8)
+      filter_narrow2_12::<BD>(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5)
     };
     Some(x)
   } else {
@@ -877,8 +878,8 @@ fn deblock_size14_inner(
 }
 
 // Assumes rec[0] is set 7 taps back from the edge
-fn deblock_v_size14<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_v_size14<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for y in 0..4 {
     let p = &rec[y];
@@ -898,15 +899,15 @@ fn deblock_v_size14<T: Pixel>(
       p[12].as_(),
       p[13].as_(),
     ];
-    if let Some(data) = deblock_size14_inner(vals, level, bd) {
+    if let Some(data) = deblock_size14_inner::<BD>(vals, level) {
       copy_horizontal(rec, 1, y, &data);
     }
   }
 }
 
 // Assumes rec[0] is set 7 taps back from the edge
-fn deblock_h_size14<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_h_size14<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for x in 0..4 {
     let vals = [
@@ -925,7 +926,7 @@ fn deblock_h_size14<T: Pixel>(
       rec[12][x].as_(),
       rec[13][x].as_(),
     ];
-    if let Some(data) = deblock_size14_inner(vals, level, bd) {
+    if let Some(data) = deblock_size14_inner::<BD>(vals, level) {
       copy_vertical(rec, x, 1, &data);
     }
   }
@@ -933,11 +934,11 @@ fn deblock_h_size14<T: Pixel>(
 
 // Assumes rec[0] and src[0] are set 7 taps back from the edge.
 // Accesses fourteen taps, accumulates twelve pixels into the tally
-fn sse_size14<T: Pixel>(
+fn sse_size14<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>,
-  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize,
+  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool,
 ) {
-  let flat = 1 << (bd - 8);
+  let flat = 1 << (BD - 8);
   for i in 0..4 {
     let (p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, a) =
       if horizontal_p {
@@ -1014,49 +1015,23 @@ fn sse_size14<T: Pixel>(
       filter_wide14_12(p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6);
     let wide8 =
       filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5);
-    let narrow2 = filter_narrow2_12(
-      p5,
-      p4,
-      p3,
-      p2,
-      p1,
-      p0,
-      q0,
-      q1,
-      q2,
-      q3,
-      q4,
-      q5,
-      bd - 8,
-    );
-    let narrow4 = filter_narrow4_12(
-      p5,
-      p4,
-      p3,
-      p2,
-      p1,
-      p0,
-      q0,
-      q1,
-      q2,
-      q3,
-      q4,
-      q5,
-      bd - 8,
-    );
+    let narrow2 =
+      filter_narrow2_12::<BD>(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5);
+    let narrow4 =
+      filter_narrow4_12::<BD>(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5);
 
     // mask8 sets the dividing line for filter vs no filter
     // flat8 decides between wide and narrow filters (unrelated to level)
     // flat14 decides between wide14 and wide8 filters
     // nhev4 sets the dividing line between narrow2 and narrow4
     let mask = clamp(
-      mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8),
+      mask8::<BD>(p3, p2, p1, p0, q0, q1, q2, q3),
       1,
       MAX_LOOP_FILTER + 1,
     );
     let flat8p = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat;
     let flat14p = flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat;
-    let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1);
+    let nhev = clamp(nhev4::<BD>(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1);
 
     // sse for each; short-circuit the 'special' no-op cases.
     let sse_none = stride_sse(&a, &none);
@@ -1098,9 +1073,9 @@ fn sse_size14<T: Pixel>(
   }
 }
 
-fn filter_v_edge<T: Pixel>(
+fn filter_v_edge<T: Pixel, const BD: usize>(
   deblock: &DeblockState, blocks: &TileBlocks, bo: TileBlockOffset,
-  p: &mut PlaneRegionMut<T>, pli: usize, bd: usize, xdec: usize, ydec: usize,
+  p: &mut PlaneRegionMut<T>, pli: usize, xdec: usize, ydec: usize,
 ) {
   let block = &blocks[bo];
   let txsize = if pli == 0 {
@@ -1126,16 +1101,16 @@ fn filter_v_edge<T: Pixel>(
         });
         match filter_size {
           4 => {
-            deblock_v_size4(&mut plane_region, level, bd);
+            deblock_v_size4::<_, BD>(&mut plane_region, level);
           }
           6 => {
-            deblock_v_size6(&mut plane_region, level, bd);
+            deblock_v_size6::<_, BD>(&mut plane_region, level);
           }
           8 => {
-            deblock_v_size8(&mut plane_region, level, bd);
+            deblock_v_size8::<_, BD>(&mut plane_region, level);
           }
           14 => {
-            deblock_v_size14(&mut plane_region, level, bd);
+            deblock_v_size14::<_, BD>(&mut plane_region, level);
           }
           _ => unreachable!(),
         }
@@ -1144,10 +1119,10 @@ fn filter_v_edge<T: Pixel>(
   }
 }
 
-fn sse_v_edge<T: Pixel>(
+fn sse_v_edge<T: Pixel, const BD: usize>(
   blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion<T>,
   src_plane: &PlaneRegion<T>, tally: &mut [i64; MAX_LOOP_FILTER + 2],
-  pli: usize, bd: usize, xdec: usize, ydec: usize,
+  pli: usize, xdec: usize, ydec: usize,
 ) {
   let block = &blocks[bo];
   let txsize = if pli == 0 {
@@ -1177,16 +1152,16 @@ fn sse_v_edge<T: Pixel>(
       });
       match filter_size {
         4 => {
-          sse_size4(&rec_region, &src_region, tally, false, bd);
+          sse_size4::<_, BD>(&rec_region, &src_region, tally, false);
         }
         6 => {
-          sse_size6(&rec_region, &src_region, tally, false, bd);
+          sse_size6::<_, BD>(&rec_region, &src_region, tally, false);
         }
         8 => {
-          sse_size8(&rec_region, &src_region, tally, false, bd);
+          sse_size8::<_, BD>(&rec_region, &src_region, tally, false);
         }
         14 => {
-          sse_size14(&rec_region, &src_region, tally, false, bd);
+          sse_size14::<_, BD>(&rec_region, &src_region, tally, false);
         }
         _ => unreachable!(),
       }
@@ -1194,9 +1169,9 @@ fn sse_v_edge<T: Pixel>(
   }
 }
 
-fn filter_h_edge<T: Pixel>(
+fn filter_h_edge<T: Pixel, const BD: usize>(
   deblock: &DeblockState, blocks: &TileBlocks, bo: TileBlockOffset,
-  p: &mut PlaneRegionMut<T>, pli: usize, bd: usize, xdec: usize, ydec: usize,
+  p: &mut PlaneRegionMut<T>, pli: usize, xdec: usize, ydec: usize,
 ) {
   let block = &blocks[bo];
   let txsize = if pli == 0 {
@@ -1222,16 +1197,16 @@ fn filter_h_edge<T: Pixel>(
         });
         match filter_size {
           4 => {
-            deblock_h_size4(&mut plane_region, level, bd);
+            deblock_h_size4::<_, BD>(&mut plane_region, level);
           }
           6 => {
-            deblock_h_size6(&mut plane_region, level, bd);
+            deblock_h_size6::<_, BD>(&mut plane_region, level);
           }
           8 => {
-            deblock_h_size8(&mut plane_region, level, bd);
+            deblock_h_size8::<_, BD>(&mut plane_region, level);
           }
           14 => {
-            deblock_h_size14(&mut plane_region, level, bd);
+            deblock_h_size14::<_, BD>(&mut plane_region, level);
           }
           _ => unreachable!(),
         }
@@ -1240,10 +1215,10 @@ fn filter_h_edge<T: Pixel>(
   }
 }
 
-fn sse_h_edge<T: Pixel>(
+fn sse_h_edge<T: Pixel, const BD: usize>(
   blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion<T>,
   src_plane: &PlaneRegion<T>, tally: &mut [i64; MAX_LOOP_FILTER + 2],
-  pli: usize, bd: usize, xdec: usize, ydec: usize,
+  pli: usize, xdec: usize, ydec: usize,
 ) {
   let block = &blocks[bo];
   let txsize = if pli == 0 {
@@ -1274,16 +1249,16 @@ fn sse_h_edge<T: Pixel>(
 
       match filter_size {
         4 => {
-          sse_size4(&rec_region, &src_region, tally, true, bd);
+          sse_size4::<_, BD>(&rec_region, &src_region, tally, true);
         }
         6 => {
-          sse_size6(&rec_region, &src_region, tally, true, bd);
+          sse_size6::<_, BD>(&rec_region, &src_region, tally, true);
         }
         8 => {
-          sse_size8(&rec_region, &src_region, tally, true, bd);
+          sse_size8::<_, BD>(&rec_region, &src_region, tally, true);
         }
         14 => {
-          sse_size14(&rec_region, &src_region, tally, true, bd);
+          sse_size14::<_, BD>(&rec_region, &src_region, tally, true);
         }
         _ => unreachable!(),
       }
@@ -1293,9 +1268,9 @@ fn sse_h_edge<T: Pixel>(
 
 // Deblocks all edges, vertical and horizontal, in a single plane
 #[hawktracer(deblock_plane)]
-pub fn deblock_plane<T: Pixel>(
+pub fn deblock_plane<T: Pixel, const BD: usize>(
   deblock: &DeblockState, p: &mut PlaneRegionMut<T>, pli: usize,
-  blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize,
+  blocks: &TileBlocks, crop_w: usize, crop_h: usize,
 ) {
   let xdec = p.plane_cfg.xdec;
   let ydec = p.plane_cfg.ydec;
@@ -1339,26 +1314,24 @@ pub fn deblock_plane<T: Pixel>(
   // edge).  Unroll to avoid corner-cases.
   if rows > 0 {
     for x in (1 << xdec..cols).step_by(1 << xdec) {
-      filter_v_edge(
+      filter_v_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset { x, y: 0 }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
     }
     if rows > 1 << ydec {
       for x in (1 << xdec..cols).step_by(1 << xdec) {
-        filter_v_edge(
+        filter_v_edge::<_, BD>(
           deblock,
           blocks,
           TileBlockOffset(BlockOffset { x, y: 1 << ydec }),
           p,
           pli,
-          bd,
           xdec,
           ydec,
         );
@@ -1371,13 +1344,12 @@ pub fn deblock_plane<T: Pixel>(
   for y in ((2 << ydec)..rows).step_by(1 << ydec) {
     // Check for vertical edge at first MI block boundary on this row
     if cols > 1 << xdec {
-      filter_v_edge(
+      filter_v_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset { x: 1 << xdec, y }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
@@ -1385,17 +1357,16 @@ pub fn deblock_plane<T: Pixel>(
     // run the rest of the row with both vertical and horizontal edge filtering.
     // Horizontal lags vertical edge by one row and two columns.
     for x in (2 << xdec..cols).step_by(1 << xdec) {
-      filter_v_edge(
+      filter_v_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset { x, y }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
-      filter_h_edge(
+      filter_h_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset {
@@ -1404,14 +1375,13 @@ pub fn deblock_plane<T: Pixel>(
         }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
     }
     // ..and the last two horizontal edges for the row
     if cols >= 2 << xdec {
-      filter_h_edge(
+      filter_h_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset {
@@ -1420,13 +1390,12 @@ pub fn deblock_plane<T: Pixel>(
         }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
     }
     if cols >= 1 << xdec {
-      filter_h_edge(
+      filter_h_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset {
@@ -1435,7 +1404,6 @@ pub fn deblock_plane<T: Pixel>(
         }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
@@ -1445,13 +1413,12 @@ pub fn deblock_plane<T: Pixel>(
   // Last horizontal row, vertical is already complete
   if rows > 1 << ydec {
     for x in (0..cols).step_by(1 << xdec) {
-      filter_h_edge(
+      filter_h_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset { x, y: rows - (1 << ydec) }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
@@ -1460,11 +1427,11 @@ pub fn deblock_plane<T: Pixel>(
 }
 
 // sse count of all edges in a single plane, accumulates into vertical and horizontal counts
-fn sse_plane<T: Pixel>(
+fn sse_plane<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<T>, src: &PlaneRegion<T>,
   v_sse: &mut [i64; MAX_LOOP_FILTER + 2],
   h_sse: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, blocks: &TileBlocks,
-  crop_w: usize, crop_h: usize, bd: usize,
+  crop_w: usize, crop_h: usize,
 ) {
   let xdec = rec.plane_cfg.xdec;
   let ydec = rec.plane_cfg.ydec;
@@ -1485,14 +1452,13 @@ fn sse_plane<T: Pixel>(
 
   // No horizontal edge filtering along top of frame
   for x in (1 << xdec..cols).step_by(1 << xdec) {
-    sse_v_edge(
+    sse_v_edge::<_, BD>(
       blocks,
       TileBlockOffset(BlockOffset { x, y: 0 }),
       rec,
       src,
       v_sse,
       pli,
-      bd,
       xdec,
       ydec,
     );
@@ -1503,37 +1469,34 @@ fn sse_plane<T: Pixel>(
   // behind vertical.
   for y in (1 << ydec..rows).step_by(1 << ydec) {
     // No vertical filtering along left edge of frame
-    sse_h_edge(
+    sse_h_edge::<_, BD>(
       blocks,
       TileBlockOffset(BlockOffset { x: 0, y }),
       rec,
       src,
       h_sse,
       pli,
-      bd,
       xdec,
       ydec,
     );
     for x in (1 << xdec..cols).step_by(1 << xdec) {
-      sse_v_edge(
+      sse_v_edge::<_, BD>(
         blocks,
         TileBlockOffset(BlockOffset { x, y }),
         rec,
         src,
         v_sse,
         pli,
-        bd,
         xdec,
         ydec,
       );
-      sse_h_edge(
+      sse_h_edge::<_, BD>(
         blocks,
         TileBlockOffset(BlockOffset { x, y }),
         rec,
         src,
         h_sse,
         pli,
-        bd,
         xdec,
         ydec,
       );
@@ -1543,18 +1506,18 @@ fn sse_plane<T: Pixel>(
 
 // Deblocks all edges in all planes of a frame
 #[hawktracer(deblock_filter_frame)]
-pub fn deblock_filter_frame<T: Pixel>(
+pub fn deblock_filter_frame<T: Pixel, const BD: usize>(
   deblock: &DeblockState, tile: &mut TileMut<T>, blocks: &TileBlocks,
-  crop_w: usize, crop_h: usize, bd: usize, planes: usize,
+  crop_w: usize, crop_h: usize, planes: usize,
 ) {
   tile.planes[..planes].par_iter_mut().enumerate().for_each(|(pli, plane)| {
-    deblock_plane(deblock, plane, pli, blocks, crop_w, crop_h, bd);
+    deblock_plane::<_, BD>(deblock, plane, pli, blocks, crop_w, crop_h);
   });
 }
 
-fn sse_optimize<T: Pixel>(
+fn sse_optimize<T: Pixel, const BD: usize>(
   rec: &Tile<T>, input: &Tile<T>, blocks: &TileBlocks, crop_w: usize,
-  crop_h: usize, bd: usize, monochrome: bool,
+  crop_h: usize, monochrome: bool,
 ) -> [u8; 4] {
   // i64 allows us to accumulate a total of ~ 35 bits worth of pixels
   assert!(
@@ -1569,7 +1532,7 @@ fn sse_optimize<T: Pixel>(
     let mut v_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2];
     let mut h_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2];
 
-    sse_plane(
+    sse_plane::<_, BD>(
       &rec.planes[pli],
       &input.planes[pli],
       &mut v_tally,
@@ -1578,7 +1541,6 @@ fn sse_optimize<T: Pixel>(
       blocks,
       crop_w,
       crop_h,
-      bd,
     );
 
     for i in 1..=MAX_LOOP_FILTER {
@@ -1619,14 +1581,14 @@ fn sse_optimize<T: Pixel>(
 }
 
 #[hawktracer(deblock_filter_optimize)]
-pub fn deblock_filter_optimize<T: Pixel, U: Pixel>(
+pub fn deblock_filter_optimize<T: Pixel, U: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, rec: &Tile<U>, input: &Tile<U>,
   blocks: &TileBlocks, crop_w: usize, crop_h: usize,
 ) -> [u8; 4] {
   if fi.config.speed_settings.fast_deblock {
-    let q = ac_q(fi.base_q_idx, 0, fi.sequence.bit_depth).get() as i32;
+    let q = ac_q::<BD>(fi.base_q_idx, 0).get() as i32;
     let level = clamp(
-      match fi.sequence.bit_depth {
+      match BD {
         8 => {
           if fi.frame_type == FrameType::KEY {
             (q * 17563 - 421_574 + (1 << 18 >> 1)) >> 18
@@ -1657,13 +1619,12 @@ pub fn deblock_filter_optimize<T: Pixel, U: Pixel>(
   } else {
     // Deblocking happens in 4x4 (luma) units; luma x,y are clipped to
     // the *crop frame* of the entire frame by 4x4 block.
-    sse_optimize(
+    sse_optimize::<_, BD>(
       rec,
       input,
       blocks,
       crop_w,
       crop_h,
-      fi.sequence.bit_depth,
       fi.sequence.chroma_sampling == Cs400,
     )
   }
diff --git a/src/dist.rs b/src/dist.rs
index 4b5536a841..453dfa99a1 100644
--- a/src/dist.rs
+++ b/src/dist.rs
@@ -32,7 +32,7 @@ pub(crate) mod rust {
   /// w and h can be at most 128, the size of the largest block.
   pub fn get_sad<T: Pixel>(
     plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
-    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
+    h: usize, _cpu: CpuFeatureLevel,
   ) -> u32 {
     debug_assert!(w <= 128 && h <= 128);
     let plane_org =
@@ -157,7 +157,7 @@ pub(crate) mod rust {
   /// 4x4 transforms instead of 8x8 transforms when width or height < 8.
   pub fn get_satd<T: Pixel>(
     plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
-    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
+    h: usize, _cpu: CpuFeatureLevel,
   ) -> u32 {
     assert!(w <= 128 && h <= 128);
     assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
@@ -186,9 +186,8 @@ pub(crate) mod rust {
 
         // Revert to sad on edge blocks (frame edges)
         if chunk_w != size || chunk_h != size {
-          sum += get_sad(
-            &chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu,
-          ) as u64;
+          sum +=
+            get_sad(&chunk_org, &chunk_ref, chunk_w, chunk_h, _cpu) as u64;
           continue;
         }
 
@@ -235,8 +234,7 @@ pub(crate) mod rust {
   #[inline(never)]
   pub fn get_weighted_sse<T: Pixel>(
     src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
-    scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
-    _cpu: CpuFeatureLevel,
+    scale_stride: usize, w: usize, h: usize, _cpu: CpuFeatureLevel,
   ) -> u64 {
     let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
     // Always chunk and apply scaling on the sse of squares the size of
@@ -301,9 +299,9 @@ pub(crate) mod rust {
   /// Computes a distortion metric of the sum of squares weighted by activity.
   /// w and h should be <= 8.
   #[inline(never)]
-  pub fn cdef_dist_kernel<T: Pixel>(
+  pub fn cdef_dist_kernel<T: Pixel, const BD: usize>(
     src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
-    bit_depth: usize, _cpu: CpuFeatureLevel,
+    _cpu: CpuFeatureLevel,
   ) -> u32 {
     // TODO: Investigate using different constants in ssim boost for block sizes
     // smaller than 8x8.
@@ -370,7 +368,7 @@ pub(crate) mod rust {
     dvar =
       ((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;
 
-    apply_ssim_boost(sse, svar, dvar, bit_depth)
+    apply_ssim_boost::<BD>(sse, svar, dvar)
   }
 }
 
@@ -453,14 +451,7 @@ pub mod test {
 
       assert_eq!(
         distortion,
-        get_sad(
-          &input_region,
-          &rec_region,
-          w,
-          h,
-          bit_depth,
-          CpuFeatureLevel::default()
-        )
+        get_sad(&input_region, &rec_region, w, h, CpuFeatureLevel::default())
       );
     }
   }
@@ -475,7 +466,7 @@ pub mod test {
     get_sad_same_inner::<u16>();
   }
 
-  fn get_satd_same_inner<T: Pixel>() {
+  fn get_satd_same_inner<T: Pixel, const BD: usize>() {
     let blocks: Vec<(usize, usize, u32)> = vec![
       (4, 4, 1408),
       (4, 8, 2016),
@@ -501,7 +492,6 @@ pub mod test {
       (64, 16, 21312),
     ];
 
-    let bit_depth: usize = 8;
     let (input_plane, rec_plane) = setup_planes::<T>();
 
     for (w, h, distortion) in blocks {
@@ -512,12 +502,11 @@ pub mod test {
 
       assert_eq!(
         distortion,
-        get_satd(
+        get_satd::<_, BD>(
           &input_region,
           &rec_region,
           w,
           h,
-          bit_depth,
           CpuFeatureLevel::default()
         )
       );
@@ -526,11 +515,11 @@ pub mod test {
 
   #[test]
   fn get_satd_same_u8() {
-    get_satd_same_inner::<u8>();
+    get_satd_same_inner::<u8, 8>();
   }
 
   #[test]
   fn get_satd_same_u16() {
-    get_satd_same_inner::<u16>();
+    get_satd_same_inner::<u16, 10>();
   }
 }
diff --git a/src/encoder.rs b/src/encoder.rs
index 2b8d2ee80e..c2b45ee37f 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -570,12 +570,12 @@ pub struct SegmentationState {
 }
 
 impl SegmentationState {
-  pub fn update_threshold(&mut self, base_q_idx: u8, bd: usize) {
-    let base_ac_q = ac_q(base_q_idx, 0, bd).get() as u64;
+  pub fn update_threshold<const BD: usize>(&mut self, base_q_idx: u8) {
+    let base_ac_q = ac_q::<BD>(base_q_idx, 0).get() as u64;
     let real_ac_q = ArrayVec::<_, MAX_SEGMENTS>::from_iter(
       self.data[..=self.max_segment as usize].iter().map(|data| {
-        ac_q(base_q_idx, data[SegLvl::SEG_LVL_ALT_Q as usize] as i8, bd).get()
-          as u64
+        ac_q::<BD>(base_q_idx, data[SegLvl::SEG_LVL_ALT_Q as usize] as i8)
+          .get() as u64
       }),
     );
     self.threshold.fill(DistortionScale(0));
@@ -1246,15 +1246,16 @@ impl<T: Pixel> FrameInvariants<T> {
       (uv_f1 * CDEF_SEC_STRENGTHS as i32 + uv_f2) as u8;
   }
 
-  pub fn set_quantizers(&mut self, qps: &QuantizerParameters) {
+  pub fn set_quantizers<const BD: usize>(
+    &mut self, qps: &QuantizerParameters,
+  ) {
     self.base_q_idx = qps.ac_qi[0];
     let base_q_idx = self.base_q_idx as i32;
     for pi in 0..3 {
       self.dc_delta_q[pi] = (qps.dc_qi[pi] as i32 - base_q_idx) as i8;
       self.ac_delta_q[pi] = (qps.ac_qi[pi] as i32 - base_q_idx) as i8;
     }
-    self.lambda =
-      qps.lambda * ((1 << (2 * (self.sequence.bit_depth - 8))) as f64);
+    self.lambda = qps.lambda * ((1 << (2 * (BD - 8))) as f64);
     self.me_lambda = self.lambda.sqrt();
     self.dist_scale = qps.dist_scale.map(DistortionScale::from);
 
@@ -1394,7 +1395,7 @@ fn get_qidx<T: Pixel>(
 ///
 /// - If the block size is invalid for subsampling
 /// - If a tx type other than DCT is used for 64x64 blocks
-pub fn encode_tx_block<T: Pixel, W: Writer>(
+pub fn encode_tx_block<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>,
   ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter,
@@ -1465,8 +1466,7 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
   let rec = &mut ts.rec.planes[p];
 
   if mode.is_intra() {
-    let bit_depth = fi.sequence.bit_depth;
-    let edge_buf = get_intra_edges(
+    let edge_buf = get_intra_edges::<_, BD>(
       &rec.as_const(),
       tile_partition_bo,
       bx,
@@ -1474,17 +1474,15 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
       bsize,
       po,
       tx_size,
-      bit_depth,
       Some(mode),
       fi.sequence.enable_intra_edge_filter,
       pred_intra_param,
     );
 
-    mode.predict_intra(
+    mode.predict_intra::<_, BD>(
       tile_rect,
       &mut rec.subregion_mut(area),
       tx_size,
-      bit_depth,
       ac,
       pred_intra_param,
       ief_params,
@@ -1536,13 +1534,12 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
     residual.fill(0);
   }
 
-  forward_transform(
+  forward_transform::<_, BD>(
     residual,
     coeffs,
     tx_size.width(),
     tx_size,
     tx_type,
-    fi.sequence.bit_depth,
     fi.cpu_feature_level,
   );
 
@@ -1579,13 +1576,12 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
   };
 
   // Reconstruct
-  dequantize(
+  dequantize::<_, BD>(
     qidx,
     qcoeffs,
     eob,
     rcoeffs,
     tx_size,
-    fi.sequence.bit_depth,
     fi.dc_delta_q[p],
     fi.ac_delta_q[p],
     fi.cpu_feature_level,
@@ -1594,13 +1590,12 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
   if eob == 0 {
     // All zero coefficients is a no-op
   } else if !fi.use_tx_domain_distortion || need_recon_pixel {
-    inverse_transform_add(
+    inverse_transform_add::<_, BD>(
       rcoeffs,
       &mut rec.subregion_mut(area),
       eob,
       tx_size,
       tx_type,
-      fi.sequence.bit_depth,
       fi.cpu_feature_level,
     );
   }
@@ -1654,7 +1649,7 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
 /// # Panics
 ///
 /// - If the block size is invalid for subsampling
-pub fn motion_compensate<T: Pixel>(
+pub fn motion_compensate<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, luma_mode: PredictionMode, ref_frames: [RefType; 2],
   mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset,
@@ -1713,7 +1708,7 @@ pub fn motion_compensate<T: Pixel>(
       };
 
       if some_use_intra {
-        luma_mode.predict_inter(
+        luma_mode.predict_inter::<_, BD>(
           fi,
           tile_rect,
           p,
@@ -1741,7 +1736,7 @@ pub fn motion_compensate<T: Pixel>(
           let area2 = Area::StartingAt { x: po2.x, y: po2.y };
           let po3 = PlaneOffset { x: po.x + 2, y: po.y + 2 };
           let area3 = Area::StartingAt { x: po3.x, y: po3.y };
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1753,7 +1748,7 @@ pub fn motion_compensate<T: Pixel>(
             mv0,
             compound_buffer,
           );
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1765,7 +1760,7 @@ pub fn motion_compensate<T: Pixel>(
             mv1,
             compound_buffer,
           );
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1777,7 +1772,7 @@ pub fn motion_compensate<T: Pixel>(
             mv2,
             compound_buffer,
           );
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1793,7 +1788,7 @@ pub fn motion_compensate<T: Pixel>(
         if bsize == BlockSize::BLOCK_8X4 {
           let mv1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].mv;
           let rf1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].ref_frames;
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1807,7 +1802,7 @@ pub fn motion_compensate<T: Pixel>(
           );
           let po3 = PlaneOffset { x: po.x, y: po.y + 2 };
           let area3 = Area::StartingAt { x: po3.x, y: po3.y };
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1823,7 +1818,7 @@ pub fn motion_compensate<T: Pixel>(
         if bsize == BlockSize::BLOCK_4X8 {
           let mv2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].mv;
           let rf2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].ref_frames;
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1837,7 +1832,7 @@ pub fn motion_compensate<T: Pixel>(
           );
           let po3 = PlaneOffset { x: po.x + 2, y: po.y };
           let area3 = Area::StartingAt { x: po3.x, y: po3.y };
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1852,7 +1847,7 @@ pub fn motion_compensate<T: Pixel>(
         }
       }
     } else {
-      luma_mode.predict_inter(
+      luma_mode.predict_inter::<_, BD>(
         fi,
         tile_rect,
         p,
@@ -1922,7 +1917,7 @@ pub fn encode_block_pre_cdef<T: Pixel, W: Writer>(
 ///
 /// - If chroma and luma do not match for inter modes
 /// - If an invalid motion vector is found
-pub fn encode_block_post_cdef<T: Pixel, W: Writer>(
+pub fn encode_block_post_cdef<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode,
   chroma_mode: PredictionMode, angle_delta: AngleDelta,
@@ -2184,10 +2179,10 @@ pub fn encode_block_post_cdef<T: Pixel, W: Writer>(
   }
 
   if is_inter {
-    motion_compensate(
+    motion_compensate::<_, BD>(
       fi, ts, cw, luma_mode, ref_frames, mvs, bsize, tile_bo, false,
     );
-    write_tx_tree(
+    write_tx_tree::<_, _, BD>(
       fi,
       ts,
       cw,
@@ -2204,7 +2199,7 @@ pub fn encode_block_post_cdef<T: Pixel, W: Writer>(
       need_recon_pixel,
     )
   } else {
-    write_tx_blocks(
+    write_tx_blocks::<_, _, BD>(
       fi,
       ts,
       cw,
@@ -2228,7 +2223,7 @@ pub fn encode_block_post_cdef<T: Pixel, W: Writer>(
 /// # Panics
 ///
 /// - If attempting to encode a lossless block (not yet supported)
-pub fn write_tx_blocks<T: Pixel, W: Writer>(
+pub fn write_tx_blocks<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode,
   chroma_mode: PredictionMode, angle_delta: AngleDelta,
@@ -2249,14 +2244,7 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
   let do_chroma =
     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
 
-  ts.qc.update(
-    qidx,
-    tx_size,
-    luma_mode.is_intra(),
-    fi.sequence.bit_depth,
-    fi.dc_delta_q[0],
-    0,
-  );
+  ts.qc.update::<BD>(qidx, tx_size, luma_mode.is_intra(), fi.dc_delta_q[0], 0);
 
   for by in 0..bh {
     for bx in 0..bw {
@@ -2268,7 +2256,7 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
         continue;
       }
       let po = tx_bo.plane_offset(&ts.input.planes[0].cfg);
-      let (has_coeff, dist) = encode_tx_block(
+      let (has_coeff, dist) = encode_tx_block::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -2333,11 +2321,10 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
   };
 
   for p in 1..3 {
-    ts.qc.update(
+    ts.qc.update::<BD>(
       qidx,
       uv_tx_size,
       true,
-      fi.sequence.bit_depth,
       fi.dc_delta_q[p],
       fi.ac_delta_q[p],
     );
@@ -2354,7 +2341,7 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
         let mut po = tile_bo.plane_offset(&ts.input.planes[p].cfg);
         po.x += (bx * uv_tx_size.width()) as isize;
         po.y += (by * uv_tx_size.height()) as isize;
-        let (has_coeff, dist) = encode_tx_block(
+        let (has_coeff, dist) = encode_tx_block::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2389,7 +2376,7 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
   (partition_has_coeff, tx_dist)
 }
 
-pub fn write_tx_tree<T: Pixel, W: Writer>(
+pub fn write_tx_tree<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode,
   angle_delta_y: i8, tile_bo: TileBlockOffset, bsize: BlockSize,
@@ -2408,14 +2395,7 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
   let mut partition_has_coeff: bool = false;
   let mut tx_dist = ScaledDistortion::zero();
 
-  ts.qc.update(
-    qidx,
-    tx_size,
-    luma_mode.is_intra(),
-    fi.sequence.bit_depth,
-    fi.dc_delta_q[0],
-    0,
-  );
+  ts.qc.update::<BD>(qidx, tx_size, luma_mode.is_intra(), fi.dc_delta_q[0], 0);
 
   // TODO: If tx-parition more than only 1-level, this code does not work.
   // It should recursively traverse the tx block that are split recursivelty by calling write_tx_tree(),
@@ -2431,7 +2411,7 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
       }
 
       let po = tx_bo.plane_offset(&ts.input.planes[0].cfg);
-      let (has_coeff, dist) = encode_tx_block(
+      let (has_coeff, dist) = encode_tx_block::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -2494,11 +2474,10 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
   };
 
   for p in 1..3 {
-    ts.qc.update(
+    ts.qc.update::<BD>(
       qidx,
       uv_tx_size,
       false,
-      fi.sequence.bit_depth,
       fi.dc_delta_q[p],
       fi.ac_delta_q[p],
     );
@@ -2515,7 +2494,7 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
         let mut po = tile_bo.plane_offset(&ts.input.planes[p].cfg);
         po.x += (bx * uv_tx_size.width()) as isize;
         po.y += (by * uv_tx_size.height()) as isize;
-        let (has_coeff, dist) = encode_tx_block(
+        let (has_coeff, dist) = encode_tx_block::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2546,7 +2525,7 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
   (partition_has_coeff, tx_dist)
 }
 
-pub fn encode_block_with_modes<T: Pixel, W: Writer>(
+pub fn encode_block_with_modes<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset,
@@ -2572,7 +2551,7 @@ pub fn encode_block_with_modes<T: Pixel, W: Writer>(
 
   let (tx_size, tx_type) = if !mode_decision.skip && !mode_decision.has_coeff {
     skip = true;
-    rdo_tx_size_type(
+    rdo_tx_size_type::<_, BD>(
       fi, ts, cw, bsize, tile_bo, mode_luma, ref_frames, mvs, skip,
     )
   } else {
@@ -2588,7 +2567,7 @@ pub fn encode_block_with_modes<T: Pixel, W: Writer>(
     tile_bo,
     skip,
   );
-  encode_block_post_cdef(
+  encode_block_post_cdef::<_, _, BD>(
     fi,
     ts,
     cw,
@@ -2612,7 +2591,7 @@ pub fn encode_block_with_modes<T: Pixel, W: Writer>(
   );
 }
 
-fn encode_partition_bottomup<T: Pixel, W: Writer>(
+fn encode_partition_bottomup<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset, ref_rd_cost: f64,
@@ -2673,7 +2652,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
     };
 
     let mode_decision =
-      rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
+      rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg);
 
     if !mode_decision.pred_mode_luma.is_intra() {
       // Fill the saved motion structure
@@ -2693,7 +2672,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
     rdo_output.part_modes.push(mode_decision.clone());
 
     if !can_split {
-      encode_block_with_modes(
+      encode_block_with_modes::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -2783,7 +2762,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
         if offset.0.x >= ts.mi_width || offset.0.y >= ts.mi_height {
           continue;
         }
-        let child_rdo_output = encode_partition_bottomup(
+        let child_rdo_output = encode_partition_bottomup::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2856,7 +2835,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
         }
 
         // FIXME: redundant block re-encode
-        encode_block_with_modes(
+        encode_block_with_modes::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2895,7 +2874,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
   rdo_output
 }
 
-fn encode_partition_topdown<T: Pixel, W: Writer>(
+fn encode_partition_topdown<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset,
@@ -2939,7 +2918,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
     debug_assert!(bsize.is_sqr());
 
     // Blocks of sizes within the supported range are subjected to a partitioning decision
-    rdo_output = rdo_partition_decision(
+    rdo_output = rdo_partition_decision::<_, _, BD>(
       fi,
       ts,
       cw,
@@ -2977,7 +2956,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
       } else {
         // Make a prediction mode decision for blocks encoded with no rdo_partition_decision call (e.g. edges)
         rdo_decision =
-          rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
+          rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg);
         &rdo_decision
       };
 
@@ -2997,7 +2976,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
       // NOTE: Cannot avoid calling rdo_tx_size_type() here again,
       // because, with top-down partition RDO, the neighboring contexts
       // of current partition can change, i.e. neighboring partitions can split down more.
-      let (tx_size, tx_type) = rdo_tx_size_type(
+      let (tx_size, tx_type) = rdo_tx_size_type::<_, BD>(
         fi, ts, cw, bsize, tile_bo, mode_luma, ref_frames, mvs, skip,
       );
 
@@ -3105,7 +3084,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
         tile_bo,
         skip,
       );
-      encode_block_post_cdef(
+      encode_block_post_cdef::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -3135,7 +3114,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
         // The optimal prediction modes for each split block is known from an rdo_partition_decision() call
         for mode in rdo_output.part_modes {
           // Each block is subjected to a new splitting decision
-          encode_partition_topdown(
+          encode_partition_topdown::<_, _, BD>(
             fi,
             ts,
             cw,
@@ -3174,7 +3153,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
         let partitions = get_sub_partitions(&four_partitions, partition);
 
         partitions.iter().for_each(|&offset| {
-          encode_partition_topdown(
+          encode_partition_topdown::<_, _, BD>(
             fi,
             ts,
             cw,
@@ -3215,7 +3194,7 @@ fn get_initial_cdfcontext<T: Pixel>(fi: &FrameInvariants<T>) -> CDFContext {
 }
 
 #[hawktracer(encode_tile_group)]
-fn encode_tile_group<T: Pixel>(
+fn encode_tile_group<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, fs: &mut FrameState<T>, inter_cfg: &InterConfig,
 ) -> Vec<u8> {
   let planes =
@@ -3233,7 +3212,7 @@ fn encode_tile_group<T: Pixel>(
     .collect::<Vec<_>>()
     .into_par_iter()
     .map(|(mut ctx, cdf)| {
-      encode_tile(fi, &mut ctx.ts, cdf, &mut ctx.tb, inter_cfg)
+      encode_tile::<_, BD>(fi, &mut ctx.ts, cdf, &mut ctx.tb, inter_cfg)
     })
     .unzip();
 
@@ -3247,7 +3226,7 @@ fn encode_tile_group<T: Pixel>(
   /* TODO: Don't apply if lossless */
   let levels = fs.apply_tile_state_mut(|ts| {
     let rec = &mut ts.rec;
-    deblock_filter_optimize(
+    deblock_filter_optimize::<_, _, BD>(
       fi,
       &rec.as_const(),
       &ts.input.as_tile(),
@@ -3261,13 +3240,12 @@ fn encode_tile_group<T: Pixel>(
   if fs.deblock.levels[0] != 0 || fs.deblock.levels[1] != 0 {
     fs.apply_tile_state_mut(|ts| {
       let rec = &mut ts.rec;
-      deblock_filter_frame(
+      deblock_filter_frame::<_, BD>(
         ts.deblock,
         rec,
         &blocks.as_tile_blocks(),
         fi.width,
         fi.height,
-        fi.sequence.bit_depth,
         planes,
       );
     });
@@ -3282,11 +3260,16 @@ fn encode_tile_group<T: Pixel>(
     if fi.sequence.enable_cdef {
       fs.apply_tile_state_mut(|ts| {
         let rec = &mut ts.rec;
-        cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec);
+        cdef_filter_tile::<_, BD>(
+          fi,
+          &deblocked_frame,
+          &blocks.as_tile_blocks(),
+          rec,
+        );
       });
     }
     /* TODO: Don't apply if lossless */
-    fs.restoration.lrf_filter_frame(
+    fs.restoration.lrf_filter_frame::<_, BD>(
       Arc::get_mut(&mut fs.rec).unwrap(),
       &deblocked_frame,
       fi,
@@ -3297,7 +3280,12 @@ fn encode_tile_group<T: Pixel>(
       let deblocked_frame = (*fs.rec).clone();
       fs.apply_tile_state_mut(|ts| {
         let rec = &mut ts.rec;
-        cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec);
+        cdef_filter_tile::<_, BD>(
+          fi,
+          &deblocked_frame,
+          &blocks.as_tile_blocks(),
+          rec,
+        );
       });
     }
   }
@@ -3353,7 +3341,7 @@ pub struct SBSQueueEntry {
   pub w_post_cdef: WriterBase<WriterRecorder>,
 }
 
-fn check_lf_queue<T: Pixel>(
+fn check_lf_queue<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w: &mut WriterBase<WriterEncoder>,
   sbs_q: &mut VecDeque<SBSQueueEntry>, last_lru_ready: &mut [i32; 3],
@@ -3405,7 +3393,7 @@ fn check_lf_queue<T: Pixel>(
             }
           }
           if !already_rdoed {
-            rdo_loop_decision(qe.sbo, fi, ts, cw, w, deblock_p);
+            rdo_loop_decision::<_, _, BD>(qe.sbo, fi, ts, cw, w, deblock_p);
             for pli in 0..planes {
               if qe.lru_index[pli] != -1
                 && last_lru_rdoed[pli] < qe.lru_index[pli]
@@ -3445,7 +3433,7 @@ fn check_lf_queue<T: Pixel>(
 }
 
 #[hawktracer(encode_tile)]
-fn encode_tile<'a, T: Pixel>(
+fn encode_tile<'a, T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &'a mut TileStateMut<'_, T>,
   fc: &'a mut CDFContext, blocks: &'a mut TileBlocksMut<'a>,
   inter_cfg: &InterConfig,
@@ -3492,7 +3480,7 @@ fn encode_tile<'a, T: Pixel>(
         || is_straddle_sbx
         || is_straddle_sby
       {
-        encode_partition_bottomup(
+        encode_partition_bottomup::<_, _, BD>(
           fi,
           ts,
           &mut cw,
@@ -3505,7 +3493,7 @@ fn encode_tile<'a, T: Pixel>(
           &mut enc_stats,
         );
       } else {
-        encode_partition_topdown(
+        encode_partition_topdown::<_, _, BD>(
           fi,
           ts,
           &mut cw,
@@ -3547,7 +3535,7 @@ fn encode_tile<'a, T: Pixel>(
         sbs_q.push_back(sbs_qe);
 
         if check_queue && !fi.sequence.enable_delayed_loopfilter_rdo {
-          check_lf_queue(
+          check_lf_queue::<_, BD>(
             fi,
             ts,
             &mut cw,
@@ -3566,7 +3554,7 @@ fn encode_tile<'a, T: Pixel>(
   if fi.sequence.enable_delayed_loopfilter_rdo {
     // Solve deblocking for just this tile
     /* TODO: Don't apply if lossless */
-    let deblock_levels = deblock_filter_optimize(
+    let deblock_levels = deblock_filter_optimize::<_, _, BD>(
       fi,
       &ts.rec.as_const(),
       &ts.input_tile,
@@ -3592,18 +3580,17 @@ fn encode_tile<'a, T: Pixel>(
       deblock_copy.levels = deblock_levels;
 
       // temporarily deblock the reference
-      deblock_filter_frame(
+      deblock_filter_frame::<_, BD>(
         &deblock_copy,
         &mut ts.rec,
         &cw.bc.blocks.as_const(),
         fi.width,
         fi.height,
-        fi.sequence.bit_depth,
         planes,
       );
 
       // rdo lf and write
-      check_lf_queue(
+      check_lf_queue::<_, BD>(
         fi,
         ts,
         &mut cw,
@@ -3627,7 +3614,7 @@ fn encode_tile<'a, T: Pixel>(
       }
     } else {
       // rdo lf and write
-      check_lf_queue(
+      check_lf_queue::<_, BD>(
         fi,
         ts,
         &mut cw,
@@ -3743,7 +3730,7 @@ fn get_initial_segmentation<T: Pixel>(
 /// # Panics
 ///
 /// - If the frame packets cannot be written
-pub fn encode_frame<T: Pixel>(
+pub fn encode_frame<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, fs: &mut FrameState<T>, inter_cfg: &InterConfig,
 ) -> Vec<u8> {
   debug_assert!(!fi.is_show_existing_frame());
@@ -3753,9 +3740,9 @@ pub fn encode_frame<T: Pixel>(
 
   if fi.enable_segmentation {
     fs.segmentation = get_initial_segmentation(fi);
-    segmentation_optimize(fi, fs);
+    segmentation_optimize::<_, BD>(fi, fs);
   }
-  let tile_group = encode_tile_group(fi, fs, inter_cfg);
+  let tile_group = encode_tile_group::<_, BD>(fi, fs, inter_cfg);
 
   if fi.frame_type == FrameType::KEY {
     write_key_frame_obus(&mut packet, fi, obu_extension).unwrap();
diff --git a/src/lrf.rs b/src/lrf.rs
index f33a48826e..c793310498 100644
--- a/src/lrf.rs
+++ b/src/lrf.rs
@@ -626,7 +626,7 @@ pub fn setup_integral_image<T: Pixel>(
   }
 }
 
-pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
+pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel, const BD: usize>(
   set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>,
   integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize,
   cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>,
@@ -647,19 +647,6 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
   let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
   let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
 
-  let fn_ab_r1 = match fi.sequence.bit_depth {
-    8 => sgrproj_box_ab_r1::<8>,
-    10 => sgrproj_box_ab_r1::<10>,
-    12 => sgrproj_box_ab_r1::<12>,
-    _ => unimplemented!(),
-  };
-  let fn_ab_r2 = match fi.sequence.bit_depth {
-    8 => sgrproj_box_ab_r2::<8>,
-    10 => sgrproj_box_ab_r2::<10>,
-    12 => sgrproj_box_ab_r2::<12>,
-    _ => unimplemented!(),
-  };
-
   /* prime the intermediate arrays */
   // One oddness about the radius=2 intermediate array computations that
   // the spec doesn't make clear: Although the spec defines computation
@@ -668,7 +655,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
   let integral_image = &integral_image_buffer.integral_image;
   let sq_integral_image = &integral_image_buffer.sq_integral_image;
   if s_r2 > 0 {
-    fn_ab_r2(
+    sgrproj_box_ab_r2::<BD>(
       &mut a_r2[0],
       &mut b_r2[0],
       integral_image,
@@ -682,7 +669,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
   }
   if s_r1 > 0 {
     let integral_image_offset = integral_image_stride + 1;
-    fn_ab_r1(
+    sgrproj_box_ab_r1::<BD>(
       &mut a_r1[0],
       &mut b_r1[0],
       &integral_image[integral_image_offset..],
@@ -693,7 +680,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
       s_r1,
       fi.cpu_feature_level,
     );
-    fn_ab_r1(
+    sgrproj_box_ab_r1::<BD>(
       &mut a_r1[1],
       &mut b_r1[1],
       &integral_image[integral_image_offset..],
@@ -712,7 +699,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
   for y in (0..stripe_h).step_by(2) {
     // get results to use y and y+1
     let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 {
-      fn_ab_r2(
+      sgrproj_box_ab_r2::<BD>(
         &mut a_r2[(y / 2 + 1) % 2],
         &mut b_r2[(y / 2 + 1) % 2],
         integral_image,
@@ -751,7 +738,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
       let y = y + dy;
       if s_r1 > 0 {
         let integral_image_offset = integral_image_stride + 1;
-        fn_ab_r1(
+        sgrproj_box_ab_r1::<BD>(
           &mut a_r1[(y + 2) % 3],
           &mut b_r1[(y + 2) % 3],
           &integral_image[integral_image_offset..],
@@ -793,9 +780,9 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
       let line = &cdeffed[y];
 
       #[inline(always)]
-      fn apply_filter<U: Pixel>(
+      fn apply_filter<U: Pixel, const BD: usize>(
         out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32],
-        stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32,
+        stripe_w: usize, w0: i32, w1: i32, w2: i32,
       ) {
         let line_it = line[..stripe_w].iter();
         let f_r2_ab_it = f_r2_ab[..stripe_w].iter();
@@ -809,17 +796,16 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
           let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32;
           let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1))
             >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
-          *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1));
+          *o = U::cast_from(clamp(s, 0, (1 << BD) - 1));
         }
       }
 
-      apply_filter(
+      apply_filter::<_, BD>(
         &mut out[y],
         line,
         &f_r1,
         f_r2_ab[dy],
         stripe_w,
-        fi.sequence.bit_depth,
         w0,
         w1,
         w2,
@@ -842,7 +828,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
 
 // Input params follow the same rules as sgrproj_stripe_filter.
 // Inputs are relative to the colocated slice views.
-pub fn sgrproj_solve<T: Pixel>(
+pub fn sgrproj_solve<T: Pixel, const BD: usize>(
   set: u8, fi: &FrameInvariants<T>,
   integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>,
   cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize,
@@ -865,19 +851,6 @@ pub fn sgrproj_solve<T: Pixel>(
   let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]];
   let mut c: [f64; 2] = [0., 0.];
 
-  let fn_ab_r1 = match fi.sequence.bit_depth {
-    8 => sgrproj_box_ab_r1::<8>,
-    10 => sgrproj_box_ab_r1::<10>,
-    12 => sgrproj_box_ab_r1::<12>,
-    _ => unimplemented!(),
-  };
-  let fn_ab_r2 = match fi.sequence.bit_depth {
-    8 => sgrproj_box_ab_r2::<8>,
-    10 => sgrproj_box_ab_r2::<10>,
-    12 => sgrproj_box_ab_r2::<12>,
-    _ => unimplemented!(),
-  };
-
   /* prime the intermediate arrays */
   // One oddness about the radius=2 intermediate array computations that
   // the spec doesn't make clear: Although the spec defines computation
@@ -886,7 +859,7 @@ pub fn sgrproj_solve<T: Pixel>(
   let integral_image = &integral_image_buffer.integral_image;
   let sq_integral_image = &integral_image_buffer.sq_integral_image;
   if s_r2 > 0 {
-    fn_ab_r2(
+    sgrproj_box_ab_r2::<BD>(
       &mut a_r2[0],
       &mut b_r2[0],
       integral_image,
@@ -900,7 +873,7 @@ pub fn sgrproj_solve<T: Pixel>(
   }
   if s_r1 > 0 {
     let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
-    fn_ab_r1(
+    sgrproj_box_ab_r1::<BD>(
       &mut a_r1[0],
       &mut b_r1[0],
       &integral_image[integral_image_offset..],
@@ -911,7 +884,7 @@ pub fn sgrproj_solve<T: Pixel>(
       s_r1,
       fi.cpu_feature_level,
     );
-    fn_ab_r1(
+    sgrproj_box_ab_r1::<BD>(
       &mut a_r1[1],
       &mut b_r1[1],
       &integral_image[integral_image_offset..],
@@ -930,7 +903,7 @@ pub fn sgrproj_solve<T: Pixel>(
   for y in (0..cdef_h).step_by(2) {
     // get results to use y and y+1
     let f_r2_01: [&[u32]; 2] = if s_r2 > 0 {
-      fn_ab_r2(
+      sgrproj_box_ab_r2::<BD>(
         &mut a_r2[(y / 2 + 1) % 2],
         &mut b_r2[(y / 2 + 1) % 2],
         integral_image,
@@ -963,7 +936,7 @@ pub fn sgrproj_solve<T: Pixel>(
       let y = y + dy;
       if s_r1 > 0 {
         let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
-        fn_ab_r1(
+        sgrproj_box_ab_r1::<BD>(
           &mut a_r1[(y + 2) % 3],
           &mut b_r1[(y + 2) % 3],
           &integral_image[integral_image_offset..],
@@ -1093,16 +1066,15 @@ pub fn sgrproj_solve<T: Pixel>(
   }
 }
 
-fn wiener_stripe_filter<T: Pixel>(
-  coeffs: [[i8; 3]; 2], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize,
-  stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize,
-  cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>,
+fn wiener_stripe_filter<T: Pixel, const BD: usize>(
+  coeffs: [[i8; 3]; 2], crop_w: usize, crop_h: usize, stripe_w: usize,
+  stripe_h: usize, stripe_x: usize, stripe_y: isize, cdeffed: &Plane<T>,
+  deblocked: &Plane<T>, out: &mut Plane<T>,
 ) {
-  let bit_depth = fi.sequence.bit_depth;
-  let round_h = if bit_depth == 12 { 5 } else { 3 };
-  let round_v = if bit_depth == 12 { 9 } else { 11 };
-  let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1);
-  let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1;
+  let round_h = if BD == 12 { 5 } else { 3 };
+  let round_v = if BD == 12 { 9 } else { 11 };
+  let offset = 1 << (BD + WIENER_BITS - round_h - 1);
+  let limit = (1 << (BD + 1 + WIENER_BITS - round_h)) - 1;
 
   let mut coeffs_ = [[0; 3]; 2];
   for i in 0..2 {
@@ -1197,7 +1169,7 @@ fn wiener_stripe_filter<T: Pixel>(
       *dst = T::cast_from(clamp(
         (acc + (1 << round_v >> 1)) >> round_v,
         0,
-        (1 << bit_depth) - 1,
+        (1 << BD) - 1,
       ));
     }
   }
@@ -1482,7 +1454,7 @@ impl RestorationState {
   }
 
   #[hawktracer(lrf_filter_frame)]
-  pub fn lrf_filter_frame<T: Pixel>(
+  pub fn lrf_filter_frame<T: Pixel, const BD: usize>(
     &mut self, out: &mut Frame<T>, pre_cdef: &Frame<T>,
     fi: &FrameInvariants<T>,
   ) {
@@ -1530,9 +1502,8 @@ impl RestorationState {
           let ru = rp.restoration_unit_by_stripe(si, rux);
           match ru.filter {
             RestorationFilter::Wiener { coeffs } => {
-              wiener_stripe_filter(
+              wiener_stripe_filter::<_, BD>(
                 coeffs,
-                fi,
                 crop_w,
                 crop_h,
                 size,
@@ -1562,7 +1533,7 @@ impl RestorationState {
                   .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
               );
 
-              sgrproj_stripe_filter(
+              sgrproj_stripe_filter::<_, _, BD>(
                 set,
                 xqd,
                 fi,
diff --git a/src/mc.rs b/src/mc.rs
index d9edde259b..45981cc6c9 100644
--- a/src/mc.rs
+++ b/src/mc.rs
@@ -247,10 +247,10 @@ pub(crate) mod rust {
   }
 
   #[cold_for_target_arch("x86_64")]
-  pub fn put_8tap<T: Pixel>(
+  pub fn put_8tap<T: Pixel, const BD: usize>(
     dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize,
     height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
-    mode_y: FilterMode, bit_depth: usize, _cpu: CpuFeatureLevel,
+    mode_y: FilterMode, _cpu: CpuFeatureLevel,
   ) {
     // The assembly only supports even heights and valid uncropped widths
     assert_eq!(height & 1, 0);
@@ -259,8 +259,8 @@ pub(crate) mod rust {
     let ref_stride = src.plane.cfg.stride;
     let y_filter = get_filter(mode_y, row_frac, height);
     let x_filter = get_filter(mode_x, col_frac, width);
-    let max_sample_val = (1 << bit_depth) - 1;
-    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
+    let max_sample_val = (1 << BD) - 1;
+    let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 };
     match (col_frac, row_frac) {
       (0, 0) => {
         for r in 0..height {
@@ -357,10 +357,10 @@ pub(crate) mod rust {
   const PREP_BIAS: i32 = 8192;
 
   #[cold_for_target_arch("x86_64")]
-  pub fn prep_8tap<T: Pixel>(
+  pub fn prep_8tap<T: Pixel, const BD: usize>(
     tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize,
     col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
-    bit_depth: usize, _cpu: CpuFeatureLevel,
+    _cpu: CpuFeatureLevel,
   ) {
     // The assembly only supports even heights and valid uncropped widths
     assert_eq!(height & 1, 0);
@@ -369,8 +369,8 @@ pub(crate) mod rust {
     let ref_stride = src.plane.cfg.stride;
     let y_filter = get_filter(mode_y, row_frac, height);
     let x_filter = get_filter(mode_x, col_frac, width);
-    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
-    let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS };
+    let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 };
+    let prep_bias = if BD == 8 { 0 } else { PREP_BIAS };
     match (col_frac, row_frac) {
       (0, 0) => {
         for r in 0..height {
@@ -451,17 +451,17 @@ pub(crate) mod rust {
   }
 
   #[cold_for_target_arch("x86_64")]
-  pub fn mc_avg<T: Pixel>(
+  pub fn mc_avg<T: Pixel, const BD: usize>(
     dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize,
-    height: usize, bit_depth: usize, _cpu: CpuFeatureLevel,
+    height: usize, _cpu: CpuFeatureLevel,
   ) {
     // The assembly only supports even heights and valid uncropped widths
     assert_eq!(height & 1, 0);
     assert!(width.is_power_of_two() && (2..=128).contains(&width));
 
-    let max_sample_val = (1 << bit_depth) - 1;
-    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
-    let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS * 2 };
+    let max_sample_val = (1 << BD) - 1;
+    let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 };
+    let prep_bias = if BD == 8 { 0 } else { PREP_BIAS * 2 };
     for r in 0..height {
       let dst_slice = &mut dst[r];
       for c in 0..width {
diff --git a/src/me.rs b/src/me.rs
index a6b09e9f03..ecbcdfa398 100644
--- a/src/me.rs
+++ b/src/me.rs
@@ -154,7 +154,7 @@ pub enum MVSamplingMode {
 }
 
 #[hawktracer(estimate_tile_motion)]
-pub fn estimate_tile_motion<T: Pixel>(
+pub fn estimate_tile_motion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   inter_cfg: &InterConfig,
 ) {
@@ -194,7 +194,7 @@ pub fn estimate_tile_motion<T: Pixel>(
               .block_offset(0, 0);
 
           if new_subsampling {
-            refine_subsampled_sb_motion(
+            refine_subsampled_sb_motion::<_, BD>(
               fi,
               ts,
               ref_frame,
@@ -205,7 +205,7 @@ pub fn estimate_tile_motion<T: Pixel>(
             );
           }
 
-          estimate_sb_motion(
+          estimate_sb_motion::<_, BD>(
             fi,
             ts,
             ref_frame,
@@ -221,7 +221,7 @@ pub fn estimate_tile_motion<T: Pixel>(
   }
 }
 
-fn estimate_sb_motion<T: Pixel>(
+fn estimate_sb_motion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, ref_frame: RefType,
   mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, init: bool, ssdec: u8,
   lambda: u32,
@@ -257,7 +257,7 @@ fn estimate_sb_motion<T: Pixel>(
       // Run motion estimation.
       // Note that the initial search (init) instructs the called function to
       // perform a more extensive search.
-      if let Some(results) = estimate_motion(
+      if let Some(results) = estimate_motion::<_, BD>(
         fi,
         ts,
         w,
@@ -285,7 +285,7 @@ fn estimate_sb_motion<T: Pixel>(
   }
 }
 
-fn refine_subsampled_sb_motion<T: Pixel>(
+fn refine_subsampled_sb_motion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, ref_frame: RefType,
   mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, ssdec: u8, lambda: u32,
 ) {
@@ -307,7 +307,7 @@ fn refine_subsampled_sb_motion<T: Pixel>(
       let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec;
 
       // Refine the existing motion estimate
-      if let Some(results) = refine_subsampled_motion_estimate(
+      if let Some(results) = refine_subsampled_motion_estimate::<_, BD>(
         fi, ts, w, h, sub_bo, ref_frame, ssdec, lambda,
       ) {
         // normalize sad to 128x128 block
@@ -536,7 +536,7 @@ fn get_subset_predictors(
   MotionEstimationSubsets { min_sad, median, subset_b, subset_c }
 }
 
-pub fn estimate_motion<T: Pixel>(
+pub fn estimate_motion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, w: usize, h: usize,
   tile_bo: TileBlockOffset, ref_frame: RefType,
   pmv: Option<[MotionVector; 2]>, corner: MVSamplingMode,
@@ -575,7 +575,7 @@ pub fn estimate_motion<T: Pixel>(
       _ => unimplemented!(),
     };
 
-    let mut best: MotionSearchResult = full_pixel_me(
+    let mut best: MotionSearchResult = full_pixel_me::<_, BD>(
       fi,
       ts,
       org_region,
@@ -599,26 +599,13 @@ pub fn estimate_motion<T: Pixel>(
     if let Some(pmv) = pmv {
       let use_satd: bool = fi.config.speed_settings.motion.use_satd_subpel;
       if use_satd {
-        best.rd = get_fullpel_mv_rd(
-          fi,
-          po,
-          org_region,
-          p_ref,
-          fi.sequence.bit_depth,
-          pmv,
-          lambda,
-          use_satd,
-          mvx_min,
-          mvx_max,
-          mvy_min,
-          mvy_max,
-          w,
-          h,
-          best.mv,
+        best.rd = get_fullpel_mv_rd::<_, BD>(
+          fi, po, org_region, p_ref, pmv, lambda, use_satd, mvx_min, mvx_max,
+          mvy_min, mvy_max, w, h, best.mv,
         );
       }
 
-      sub_pixel_me(
+      sub_pixel_me::<_, BD>(
         fi, po, org_region, p_ref, lambda, pmv, mvx_min, mvx_max, mvy_min,
         mvy_max, w, h, use_satd, &mut best, ref_frame,
       );
@@ -634,7 +621,7 @@ pub fn estimate_motion<T: Pixel>(
 }
 
 /// Refine motion estimation that was computed one level of subsampling up.
-fn refine_subsampled_motion_estimate<T: Pixel>(
+fn refine_subsampled_motion_estimate<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, w: usize, h: usize,
   tile_bo: TileBlockOffset, ref_frame: RefType, ssdec: u8, lambda: u32,
 ) -> Option<MotionSearchResult> {
@@ -679,7 +666,7 @@ fn refine_subsampled_motion_estimate<T: Pixel>(
     let x_hi = po.x + (mv.col as isize / 8 + 2).min(mvx_max / 8);
     let y_lo = po.y + (mv.row as isize / 8 - 1).max(mvy_min / 8);
     let y_hi = po.y + (mv.row as isize / 8 + 2).min(mvy_max / 8);
-    let mut results = full_search(
+    let mut results = full_search::<_, BD>(
       fi, x_lo, x_hi, y_lo, y_hi, w, h, org_region, p_ref, po, 1, lambda, pmv,
     );
 
@@ -692,7 +679,7 @@ fn refine_subsampled_motion_estimate<T: Pixel>(
   }
 }
 
-fn full_pixel_me<T: Pixel>(
+fn full_pixel_me<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
   org_region: &PlaneRegion<T>, p_ref: &Plane<T>, tile_bo: TileBlockOffset,
   po: PlaneOffset, lambda: u32, pmv: [MotionVector; 2], w: usize, h: usize,
@@ -722,29 +709,16 @@ fn full_pixel_me<T: Pixel>(
 
   let try_cands = |predictors: &[MotionVector],
                    best: &mut MotionSearchResult| {
-    let mut results = get_best_predictor(
-      fi,
-      po,
-      org_region,
-      p_ref,
-      predictors,
-      fi.sequence.bit_depth,
-      pmv,
-      lambda,
-      mvx_min,
-      mvx_max,
-      mvy_min,
-      mvy_max,
-      w,
-      h,
+    let mut results = get_best_predictor::<_, BD>(
+      fi, po, org_region, p_ref, predictors, pmv, lambda, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h,
     );
-    fullpel_diamond_search(
+    fullpel_diamond_search::<_, BD>(
       fi,
       po,
       org_region,
       p_ref,
       &mut results,
-      fi.sequence.bit_depth,
       pmv,
       lambda,
       mvx_min,
@@ -770,8 +744,8 @@ fn full_pixel_me<T: Pixel>(
     // from the previous frame. Stop once a candidate with a sad less than a
     // threshold is found.
 
-    let thresh = (subsets.min_sad as f32 * 1.2) as u32
-      + (((w * h) as u32) << (fi.sequence.bit_depth - 8));
+    let thresh =
+      (subsets.min_sad as f32 * 1.2) as u32 + (((w * h) as u32) << (BD - 8));
 
     if let Some(median) = subsets.median {
       try_cands(&[median], &mut best);
@@ -795,21 +769,9 @@ fn full_pixel_me<T: Pixel>(
 
     // Preform UMH search, either as the last possible search when full search
     // is disabled, or as the last search before resorting to full search.
-    uneven_multi_hex_search(
-      fi,
-      po,
-      org_region,
-      p_ref,
-      &mut best,
-      fi.sequence.bit_depth,
-      pmv,
-      lambda,
-      mvx_min,
-      mvx_max,
-      mvy_min,
-      mvy_max,
-      w,
-      h,
+    uneven_multi_hex_search::<_, BD>(
+      fi, po, org_region, p_ref, &mut best, pmv, lambda, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h,
       // Use 24, since it is the largest range that x264 uses.
       24,
     );
@@ -829,7 +791,7 @@ fn full_pixel_me<T: Pixel>(
       let y_lo = po.y + (-range_y).max(mvy_min / 8);
       let y_hi = po.y + (range_y).min(mvy_max / 8);
 
-      let results = full_search(
+      let results = full_search::<_, BD>(
         fi,
         x_lo,
         x_hi,
@@ -857,44 +819,30 @@ fn full_pixel_me<T: Pixel>(
   }
 }
 
-fn sub_pixel_me<T: Pixel>(
+fn sub_pixel_me<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
   p_ref: &Plane<T>, lambda: u32, pmv: [MotionVector; 2], mvx_min: isize,
   mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize,
   use_satd: bool, best: &mut MotionSearchResult, ref_frame: RefType,
 ) {
-  subpel_diamond_search(
-    fi,
-    po,
-    org_region,
-    p_ref,
-    fi.sequence.bit_depth,
-    pmv,
-    lambda,
-    mvx_min,
-    mvx_max,
-    mvy_min,
-    mvy_max,
-    w,
-    h,
-    use_satd,
-    best,
-    ref_frame,
+  subpel_diamond_search::<_, BD>(
+    fi, po, org_region, p_ref, pmv, lambda, mvx_min, mvx_max, mvy_min,
+    mvy_max, w, h, use_satd, best, ref_frame,
   );
 }
 
-fn get_best_predictor<T: Pixel>(
+fn get_best_predictor<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, predictors: &[MotionVector], bit_depth: usize,
-  pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize,
-  mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  p_ref: &Plane<T>, predictors: &[MotionVector], pmv: [MotionVector; 2],
+  lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize,
+  w: usize, h: usize,
 ) -> MotionSearchResult {
   let mut best: MotionSearchResult = MotionSearchResult::empty();
 
   for &init_mv in predictors.iter() {
-    let rd = get_fullpel_mv_rd(
-      fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-      mvx_max, mvy_min, mvy_max, w, h, init_mv,
+    let rd = get_fullpel_mv_rd::<_, BD>(
+      fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h, init_mv,
     );
 
     if rd.cost < best.rd.cost {
@@ -953,11 +901,11 @@ const DIAMOND_R1_PATTERN: [MotionVector; 4] = search_pattern!(
 /// For each step size, candidate motion vectors are examined for improvement
 /// to the current search location. The search location is moved to the best
 /// candidate (if any). This is repeated until the search location stops moving.
-fn fullpel_diamond_search<T: Pixel>(
+fn fullpel_diamond_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize,
-  pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize,
-  mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  p_ref: &Plane<T>, current: &mut MotionSearchResult, pmv: [MotionVector; 2],
+  lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize,
+  w: usize, h: usize,
 ) {
   // Define the initial and the final scale (log2) of the diamond.
   let (mut diamond_radius_log2, diamond_radius_end_log2) = (1u8, 0u8);
@@ -967,9 +915,9 @@ fn fullpel_diamond_search<T: Pixel>(
     let mut best_cand: MotionSearchResult = MotionSearchResult::empty();
     for &offset in &DIAMOND_R1_PATTERN {
       let cand_mv = current.mv + (offset << diamond_radius_log2);
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < best_cand.rd.cost {
@@ -1052,11 +1000,11 @@ const SQUARE_REFINE_PATTERN: [MotionVector; 8] = search_pattern!(
 ///
 /// `current` provides the initial search location and serves as
 /// the output for the final search results.
-fn hexagon_search<T: Pixel>(
+fn hexagon_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize,
-  pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize,
-  mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  p_ref: &Plane<T>, current: &mut MotionSearchResult, pmv: [MotionVector; 2],
+  lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize,
+  w: usize, h: usize,
 ) {
   // The first iteration of hexagon search is implemented separate from
   // subsequent iterations, which overlap with previous iterations.
@@ -1070,9 +1018,9 @@ fn hexagon_search<T: Pixel>(
   // First iteration of hexagon search. There are six candidates to consider.
   for i in 0..6 {
     let cand_mv = current.mv + HEXAGON_PATTERN[i];
-    let rd = get_fullpel_mv_rd(
-      fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-      mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+    let rd = get_fullpel_mv_rd::<_, BD>(
+      fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h, cand_mv,
     );
 
     if rd.cost < best_cand.rd.cost {
@@ -1102,9 +1050,9 @@ fn hexagon_search<T: Pixel>(
       let i = (center_cand_idx + idx_offset_mod6) % 6;
       let cand_mv = current.mv + HEXAGON_PATTERN[i];
 
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < best_cand.rd.cost {
@@ -1119,9 +1067,9 @@ fn hexagon_search<T: Pixel>(
   let mut best_cand: MotionSearchResult = MotionSearchResult::empty();
   for &offset in &SQUARE_REFINE_PATTERN {
     let cand_mv = current.mv + offset;
-    let rd = get_fullpel_mv_rd(
-      fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-      mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+    let rd = get_fullpel_mv_rd::<_, BD>(
+      fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h, cand_mv,
     );
 
     if rd.cost < best_cand.rd.cost {
@@ -1166,11 +1114,11 @@ const UMH_PATTERN: [MotionVector; 16] = search_pattern!(
 /// the output for the final search results.
 ///
 /// `me_range` parameter determines how far these stages can search.
-fn uneven_multi_hex_search<T: Pixel>(
+fn uneven_multi_hex_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize,
-  pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize,
-  mvy_min: isize, mvy_max: isize, w: usize, h: usize, me_range: i16,
+  p_ref: &Plane<T>, current: &mut MotionSearchResult, pmv: [MotionVector; 2],
+  lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize,
+  w: usize, h: usize, me_range: i16,
 ) {
   assert!(!current.is_empty());
 
@@ -1199,9 +1147,9 @@ fn uneven_multi_hex_search<T: Pixel>(
 
     for &offset in &HORIZONTAL_LINE {
       let cand_mv = center + offset * i;
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < current.rd.cost {
@@ -1220,9 +1168,9 @@ fn uneven_multi_hex_search<T: Pixel>(
 
     for &offset in &VERTICAL_LINE {
       let cand_mv = center + offset * i;
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < current.rd.cost {
@@ -1240,9 +1188,9 @@ fn uneven_multi_hex_search<T: Pixel>(
         continue;
       }
       let cand_mv = center + MotionVector { row, col };
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < current.rd.cost {
@@ -1282,9 +1230,9 @@ fn uneven_multi_hex_search<T: Pixel>(
   for i in 1..=iterations {
     for &offset in &UMH_PATTERN {
       let cand_mv = center + offset * i;
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < current.rd.cost {
@@ -1295,9 +1243,9 @@ fn uneven_multi_hex_search<T: Pixel>(
   }
 
   // Refine the search results using a 'normal' hexagon search.
-  hexagon_search(
-    fi, po, org_region, p_ref, current, bit_depth, pmv, lambda, mvx_min,
-    mvx_max, mvy_min, mvy_max, w, h,
+  hexagon_search::<_, BD>(
+    fi, po, org_region, p_ref, current, pmv, lambda, mvx_min, mvx_max,
+    mvy_min, mvy_max, w, h,
   );
 }
 
@@ -1306,12 +1254,11 @@ fn uneven_multi_hex_search<T: Pixel>(
 /// For each step size, candidate motion vectors are examined for improvement
 /// to the current search location. The search location is moved to the best
 /// candidate (if any). This is repeated until the search location stops moving.
-fn subpel_diamond_search<T: Pixel>(
+fn subpel_diamond_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  _p_ref: &Plane<T>, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32,
-  mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize,
-  h: usize, use_satd: bool, current: &mut MotionSearchResult,
-  ref_frame: RefType,
+  _p_ref: &Plane<T>, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize,
+  mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  use_satd: bool, current: &mut MotionSearchResult, ref_frame: RefType,
 ) {
   use crate::util::Aligned;
 
@@ -1340,11 +1287,10 @@ fn subpel_diamond_search<T: Pixel>(
     for &offset in &DIAMOND_R1_PATTERN_SUBPEL {
       let cand_mv = current.mv + (offset << diamond_radius_log2);
 
-      let rd = get_subpel_mv_rd(
+      let rd = get_subpel_mv_rd::<_, BD>(
         fi,
         po,
         org_region,
-        bit_depth,
         pmv,
         lambda,
         use_satd,
@@ -1381,11 +1327,11 @@ fn subpel_diamond_search<T: Pixel>(
 }
 
 #[inline]
-fn get_fullpel_mv_rd<T: Pixel>(
+fn get_fullpel_mv_rd<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32,
-  use_satd: bool, mvx_min: isize, mvx_max: isize, mvy_min: isize,
-  mvy_max: isize, w: usize, h: usize, cand_mv: MotionVector,
+  p_ref: &Plane<T>, pmv: [MotionVector; 2], lambda: u32, use_satd: bool,
+  mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize,
+  h: usize, cand_mv: MotionVector,
 ) -> MVCandidateRD {
   if (cand_mv.col as isize) < mvx_min
     || (cand_mv.col as isize) > mvx_max
@@ -1400,17 +1346,16 @@ fn get_fullpel_mv_rd<T: Pixel>(
     x: po.x + (cand_mv.col / 8) as isize,
     y: po.y + (cand_mv.row / 8) as isize,
   });
-  compute_mv_rd(
-    fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region,
-    &plane_ref,
+  compute_mv_rd::<_, BD>(
+    fi, pmv, lambda, use_satd, w, h, cand_mv, org_region, &plane_ref,
   )
 }
 
-fn get_subpel_mv_rd<T: Pixel>(
+fn get_subpel_mv_rd<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, use_satd: bool,
-  mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize,
-  h: usize, cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut<T>,
+  pmv: [MotionVector; 2], lambda: u32, use_satd: bool, mvx_min: isize,
+  mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut<T>,
   ref_frame: RefType,
 ) -> MVCandidateRD {
   if (cand_mv.col as isize) < mvx_min
@@ -1426,29 +1371,28 @@ fn get_subpel_mv_rd<T: Pixel>(
   let tile_rect =
     TileRect { x: 0, y: 0, width: tmp_width, height: tmp_height };
 
-  PredictionMode::NEWMV.predict_inter_single(
+  PredictionMode::NEWMV.predict_inter_single::<_, BD>(
     fi, tile_rect, 0, po, tmp_region,
     // motion comp's w & h on edges can be different than distortion's
     tmp_width, tmp_height, ref_frame, cand_mv,
   );
   let plane_ref = tmp_region.as_const();
-  compute_mv_rd(
-    fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region,
-    &plane_ref,
+  compute_mv_rd::<_, BD>(
+    fi, pmv, lambda, use_satd, w, h, cand_mv, org_region, &plane_ref,
   )
 }
 
 /// Compute the rate distortion stats for a motion vector.
 #[inline(always)]
-fn compute_mv_rd<T: Pixel>(
+fn compute_mv_rd<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, pmv: [MotionVector; 2], lambda: u32,
-  use_satd: bool, bit_depth: usize, w: usize, h: usize, cand_mv: MotionVector,
+  use_satd: bool, w: usize, h: usize, cand_mv: MotionVector,
   plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>,
 ) -> MVCandidateRD {
   let sad = if use_satd {
-    get_satd(plane_org, plane_ref, w, h, bit_depth, fi.cpu_feature_level)
+    get_satd::<_, BD>(plane_org, plane_ref, w, h, fi.cpu_feature_level)
   } else {
-    get_sad(plane_org, plane_ref, w, h, bit_depth, fi.cpu_feature_level)
+    get_sad(plane_org, plane_ref, w, h, fi.cpu_feature_level)
   };
 
   let rate1 = get_mv_rate(cand_mv, pmv[0], fi.allow_high_precision_mv);
@@ -1458,7 +1402,7 @@ fn compute_mv_rd<T: Pixel>(
   MVCandidateRD { cost: 256 * sad as u64 + rate as u64 * lambda as u64, sad }
 }
 
-fn full_search<T: Pixel>(
+fn full_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize,
   w: usize, h: usize, org_region: &PlaneRegion<T>, p_ref: &Plane<T>,
   po: PlaneOffset, step: usize, lambda: u32, pmv: [MotionVector; 2],
@@ -1482,12 +1426,11 @@ fn full_search<T: Pixel>(
         col: 8 * (x as i16 - po.x as i16),
       };
 
-      let rd = compute_mv_rd(
+      let rd = compute_mv_rd::<_, BD>(
         fi,
         pmv,
         lambda,
         false,
-        fi.sequence.bit_depth,
         w,
         h,
         mv,
diff --git a/src/partition.rs b/src/partition.rs
index e64de6e9ad..22b800b646 100644
--- a/src/partition.rs
+++ b/src/partition.rs
@@ -591,7 +591,7 @@ fn supersample_chroma_bsize(
   }
 }
 
-pub fn get_intra_edges<T: Pixel>(
+pub fn get_intra_edges<T: Pixel, const BD: usize>(
   dst: &PlaneRegion<'_, T>,
   partition_bo: TileBlockOffset, // partition bo, BlockOffset
   bx: usize,
@@ -599,7 +599,6 @@ pub fn get_intra_edges<T: Pixel>(
   partition_size: BlockSize, // partition size, BlockSize
   po: PlaneOffset,
   tx_size: TxSize,
-  bit_depth: usize,
   opt_mode: Option<PredictionMode>,
   enable_intra_edge_filter: bool,
   intra_param: IntraParam,
@@ -610,7 +609,7 @@ pub fn get_intra_edges<T: Pixel>(
   let mut edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> =
     unsafe { Aligned::uninitialized() };
   //Aligned::new([T::cast_from(0); 4 * MAX_TX_SIZE + 1]);
-  let base = 128u16 << (bit_depth - 8);
+  let base = 128u16 << (BD - 8);
 
   {
     // left pixels are ordered from bottom to top and right-aligned
diff --git a/src/predict.rs b/src/predict.rs
index 632196c72a..1de1b7f8bc 100644
--- a/src/predict.rs
+++ b/src/predict.rs
@@ -205,9 +205,9 @@ impl PredictionMode {
   /// # Panics
   ///
   /// - If called on an inter `PredictionMode`
-  pub fn predict_intra<T: Pixel>(
+  pub fn predict_intra<T: Pixel, const BD: usize>(
     self, tile_rect: TileRect, dst: &mut PlaneRegionMut<'_, T>,
-    tx_size: TxSize, bit_depth: usize, ac: &[i16], intra_param: IntraParam,
+    tx_size: TxSize, ac: &[i16], intra_param: IntraParam,
     ief_params: Option<IntraEdgeFilterParameters>,
     edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, cpu: CpuFeatureLevel,
   ) {
@@ -245,9 +245,8 @@ impl PredictionMode {
       _ => intra_mode_to_angle(mode) + (angle_delta * ANGLE_STEP) as isize,
     };
 
-    dispatch_predict_intra::<T>(
-      mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf,
-      cpu,
+    dispatch_predict_intra::<T, BD>(
+      mode, variant, dst, tx_size, ac, angle, ief_params, edge_buf, cpu,
     );
   }
 
@@ -304,7 +303,7 @@ impl PredictionMode {
   /// # Panics
   ///
   /// - If called on an intra `PredictionMode`
-  pub fn predict_inter_single<T: Pixel>(
+  pub fn predict_inter_single<T: Pixel, const BD: usize>(
     self, fi: &FrameInvariants<T>, tile_rect: TileRect, p: usize,
     po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize,
     height: usize, ref_frame: RefType, mv: MotionVector,
@@ -319,7 +318,7 @@ impl PredictionMode {
     {
       let (row_frac, col_frac, src) =
         PredictionMode::get_mv_params(&rec.frame.planes[p], frame_po, mv);
-      put_8tap(
+      put_8tap::<_, BD>(
         dst,
         src,
         width,
@@ -328,7 +327,6 @@ impl PredictionMode {
         row_frac,
         mode,
         mode,
-        fi.sequence.bit_depth,
         fi.cpu_feature_level,
       );
     }
@@ -339,7 +337,7 @@ impl PredictionMode {
   /// # Panics
   ///
   /// - If called on an intra `PredictionMode`
-  pub fn predict_inter_compound<T: Pixel>(
+  pub fn predict_inter_compound<T: Pixel, const BD: usize>(
     self, fi: &FrameInvariants<T>, tile_rect: TileRect, p: usize,
     po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize,
     height: usize, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
@@ -359,7 +357,7 @@ impl PredictionMode {
           frame_po,
           mvs[i],
         );
-        prep_8tap(
+        prep_8tap::<_, BD>(
           buffer.get_buffer_mut(i),
           src,
           width,
@@ -368,25 +366,23 @@ impl PredictionMode {
           row_frac,
           mode,
           mode,
-          fi.sequence.bit_depth,
           fi.cpu_feature_level,
         );
       }
     }
-    mc_avg(
+    mc_avg::<_, BD>(
       dst,
       buffer.get_buffer(0),
       buffer.get_buffer(1),
       width,
       height,
-      fi.sequence.bit_depth,
       fi.cpu_feature_level,
     );
   }
 
   /// Inter prediction that determines whether compound mode is being used based
   /// on the second [`RefType`] in [`ref_frames`].
-  pub fn predict_inter<T: Pixel>(
+  pub fn predict_inter<T: Pixel, const BD: usize>(
     self, fi: &FrameInvariants<T>, tile_rect: TileRect, p: usize,
     po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize,
     height: usize, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
@@ -396,7 +392,7 @@ impl PredictionMode {
       && ref_frames[1] != RefType::NONE_FRAME;
 
     if !is_compound {
-      self.predict_inter_single(
+      self.predict_inter_single::<_, BD>(
         fi,
         tile_rect,
         p,
@@ -408,7 +404,7 @@ impl PredictionMode {
         mvs[0],
       )
     } else {
-      self.predict_inter_compound(
+      self.predict_inter_compound::<_, BD>(
         fi,
         tile_rect,
         p,
@@ -698,10 +694,10 @@ pub(crate) mod rust {
   use std::mem::size_of;
 
   #[inline(always)]
-  pub fn dispatch_predict_intra<T: Pixel>(
+  pub fn dispatch_predict_intra<T: Pixel, const BD: usize>(
     mode: PredictionMode, variant: PredictionVariant,
-    dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize,
-    ac: &[i16], angle: isize, ief_params: Option<IntraEdgeFilterParameters>,
+    dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, ac: &[i16],
+    angle: isize, ief_params: Option<IntraEdgeFilterParameters>,
     edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, _cpu: CpuFeatureLevel,
   ) {
     let width = tx_size.width();
@@ -718,11 +714,11 @@ pub(crate) mod rust {
     match mode {
       PredictionMode::DC_PRED => {
         (match variant {
-          PredictionVariant::NONE => pred_dc_128,
+          PredictionVariant::NONE => pred_dc_128::<_, BD>,
           PredictionVariant::LEFT => pred_dc_left,
           PredictionVariant::TOP => pred_dc_top,
           PredictionVariant::BOTH => pred_dc,
-        })(dst, above_slice, left_slice, width, height, bit_depth)
+        })(dst, above_slice, left_slice, width, height)
       }
       PredictionMode::V_PRED if angle == 90 => {
         pred_v(dst, above_slice, width, height)
@@ -737,7 +733,7 @@ pub(crate) mod rust {
       | PredictionMode::D113_PRED
       | PredictionMode::D157_PRED
       | PredictionMode::D203_PRED
-      | PredictionMode::D67_PRED => pred_directional(
+      | PredictionMode::D67_PRED => pred_directional::<_, BD>(
         dst,
         above_slice,
         left_and_left_below_slice,
@@ -745,7 +741,6 @@ pub(crate) mod rust {
         angle as usize,
         width,
         height,
-        bit_depth,
         ief_params,
       ),
       PredictionMode::SMOOTH_PRED => {
@@ -760,28 +755,23 @@ pub(crate) mod rust {
       PredictionMode::PAETH_PRED => {
         pred_paeth(dst, above_slice, left_slice, top_left[0], width, height)
       }
-      PredictionMode::UV_CFL_PRED => (match variant {
-        PredictionVariant::NONE => pred_cfl_128,
-        PredictionVariant::LEFT => pred_cfl_left,
-        PredictionVariant::TOP => pred_cfl_top,
-        PredictionVariant::BOTH => pred_cfl,
-      })(
-        dst,
-        ac,
-        angle as i16,
-        above_slice,
-        left_slice,
-        width,
-        height,
-        bit_depth,
-      ),
+      PredictionMode::UV_CFL_PRED => {
+        (match variant {
+          PredictionVariant::NONE => pred_cfl_128::<_, BD>,
+          PredictionVariant::LEFT => pred_cfl_left::<_, BD>,
+          PredictionVariant::TOP => pred_cfl_top::<_, BD>,
+          PredictionVariant::BOTH => pred_cfl::<_, BD>,
+        })(
+          dst, ac, angle as i16, above_slice, left_slice, width, height
+        )
+      }
       _ => unimplemented!(),
     }
   }
 
   pub(crate) fn pred_dc<T: Pixel>(
     output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], width: usize,
-    height: usize, _bit_depth: usize,
+    height: usize,
   ) {
     let edges = left[..height].iter().chain(above[..width].iter());
     let len = (width + height) as u32;
@@ -797,11 +787,11 @@ pub(crate) mod rust {
     }
   }
 
-  pub(crate) fn pred_dc_128<T: Pixel>(
+  pub(crate) fn pred_dc_128<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, _above: &[T], _left: &[T],
-    width: usize, height: usize, bit_depth: usize,
+    width: usize, height: usize,
   ) {
-    let v = T::cast_from(128u32 << (bit_depth - 8));
+    let v = T::cast_from(128u32 << (BD - 8));
     for line in output.rows_iter_mut().take(height) {
       line[..width].fill(v);
     }
@@ -809,7 +799,7 @@ pub(crate) mod rust {
 
   pub(crate) fn pred_dc_left<T: Pixel>(
     output: &mut PlaneRegionMut<'_, T>, _above: &[T], left: &[T],
-    width: usize, height: usize, _bit_depth: usize,
+    width: usize, height: usize,
   ) {
     let sum = left[..].iter().fold(0u32, |acc, &v| {
       let v: u32 = v.into();
@@ -823,7 +813,7 @@ pub(crate) mod rust {
 
   pub(crate) fn pred_dc_top<T: Pixel>(
     output: &mut PlaneRegionMut<'_, T>, above: &[T], _left: &[T],
-    width: usize, height: usize, _bit_depth: usize,
+    width: usize, height: usize,
   ) {
     let sum = above[..width].iter().fold(0u32, |acc, &v| {
       let v: u32 = v.into();
@@ -1051,9 +1041,9 @@ pub(crate) mod rust {
     }
   }
 
-  pub(crate) fn pred_cfl_inner<T: Pixel>(
+  pub(crate) fn pred_cfl_inner<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, width: usize,
-    height: usize, bit_depth: usize,
+    height: usize,
   ) {
     if alpha == 0 {
       return;
@@ -1063,7 +1053,7 @@ pub(crate) mod rust {
     assert!(output.plane_cfg.stride >= width);
     assert!(output.rows_iter().len() >= height);
 
-    let sample_max = (1 << bit_depth) - 1;
+    let sample_max = (1 << BD) - 1;
     let avg: i32 = output[0][0].into();
 
     for (line, luma) in
@@ -1077,43 +1067,43 @@ pub(crate) mod rust {
     }
   }
 
-  pub(crate) fn pred_cfl<T: Pixel>(
+  pub(crate) fn pred_cfl<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T],
-    left: &[T], width: usize, height: usize, bit_depth: usize,
+    left: &[T], width: usize, height: usize,
   ) {
-    pred_dc(output, above, left, width, height, bit_depth);
-    pred_cfl_inner(output, ac, alpha, width, height, bit_depth);
+    pred_dc(output, above, left, width, height);
+    pred_cfl_inner::<_, BD>(output, ac, alpha, width, height);
   }
 
-  pub(crate) fn pred_cfl_128<T: Pixel>(
+  pub(crate) fn pred_cfl_128<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T],
-    left: &[T], width: usize, height: usize, bit_depth: usize,
+    left: &[T], width: usize, height: usize,
   ) {
-    pred_dc_128(output, above, left, width, height, bit_depth);
-    pred_cfl_inner(output, ac, alpha, width, height, bit_depth);
+    pred_dc_128::<_, BD>(output, above, left, width, height);
+    pred_cfl_inner::<_, BD>(output, ac, alpha, width, height);
   }
 
-  pub(crate) fn pred_cfl_left<T: Pixel>(
+  pub(crate) fn pred_cfl_left<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T],
-    left: &[T], width: usize, height: usize, bit_depth: usize,
+    left: &[T], width: usize, height: usize,
   ) {
-    pred_dc_left(output, above, left, width, height, bit_depth);
-    pred_cfl_inner(output, ac, alpha, width, height, bit_depth);
+    pred_dc_left(output, above, left, width, height);
+    pred_cfl_inner::<_, BD>(output, ac, alpha, width, height);
   }
 
-  pub(crate) fn pred_cfl_top<T: Pixel>(
+  pub(crate) fn pred_cfl_top<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T],
-    left: &[T], width: usize, height: usize, bit_depth: usize,
+    left: &[T], width: usize, height: usize,
   ) {
-    pred_dc_top(output, above, left, width, height, bit_depth);
-    pred_cfl_inner(output, ac, alpha, width, height, bit_depth);
+    pred_dc_top(output, above, left, width, height);
+    pred_cfl_inner::<_, BD>(output, ac, alpha, width, height);
   }
 
   #[allow(clippy::clone_double_ref)]
-  pub(crate) fn pred_directional<T: Pixel>(
+  pub(crate) fn pred_directional<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T],
     top_left: &[T], p_angle: usize, width: usize, height: usize,
-    bit_depth: usize, ief_params: Option<IntraEdgeFilterParameters>,
+    ief_params: Option<IntraEdgeFilterParameters>,
   ) {
     #[allow(clippy::collapsible_if)]
     #[allow(clippy::collapsible_else_if)]
@@ -1223,7 +1213,7 @@ pub(crate) mod rust {
       edge.copy_from_slice(edge_filtered.as_slice());
     }
 
-    fn upsample_edge<T: Pixel>(size: usize, edge: &mut [T], bit_depth: usize) {
+    fn upsample_edge<T: Pixel, const BD: usize>(size: usize, edge: &mut [T]) {
       // The input edge should be valid in the -1..size range,
       // where the -1 index is the top-left edge pixel. Since
       // negative indices are unsafe in Rust, the caller is
@@ -1247,14 +1237,14 @@ pub(crate) mod rust {
           + (9 * dup[i + 1].to_i32().unwrap())
           + (9 * dup[i + 2].to_i32().unwrap())
           - dup[i + 3].to_i32().unwrap();
-        s = ((s + 8) / 16).clamp(0, (1 << bit_depth) - 1);
+        s = ((s + 8) / 16).clamp(0, (1 << BD) - 1);
 
         edge[2 * i + 1] = T::cast_from(s);
         edge[2 * i + 2] = dup[i + 2];
       }
     }
 
-    let sample_max = (1 << bit_depth) - 1;
+    let sample_max = (1 << BD) - 1;
 
     let max_x = output.plane_cfg.width as isize - 1;
     let max_y = output.plane_cfg.height as isize - 1;
@@ -1332,7 +1322,7 @@ pub(crate) mod rust {
         p_angle as isize - 90,
       );
       if upsample_above {
-        upsample_edge(num_px.0, above_filtered.as_mut_slice(), bit_depth);
+        upsample_edge::<_, BD>(num_px.0, &mut above_filtered[..]);
       }
       upsample_left = select_ief_upsample(
         width,
@@ -1341,7 +1331,7 @@ pub(crate) mod rust {
         p_angle as isize - 180,
       );
       if upsample_left {
-        upsample_edge(num_px.1, left_filtered.as_mut_slice(), bit_depth);
+        upsample_edge::<_, BD>(num_px.1, &mut left_filtered[..]);
       }
 
       left_filtered.reverse();
@@ -1509,16 +1499,16 @@ mod test {
 
     let mut output = Plane::from_slice(&[0u8; 4 * 4], 4);
 
-    pred_dc(&mut output.as_region_mut(), above, left, 4, 4, 8);
+    pred_dc(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [32u8; 16]);
 
-    pred_dc_top(&mut output.as_region_mut(), above, left, 4, 4, 8);
+    pred_dc_top(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [35u8; 16]);
 
-    pred_dc_left(&mut output.as_region_mut(), above, left, 4, 4, 8);
+    pred_dc_left(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [30u8; 16]);
 
-    pred_dc_128(&mut output.as_region_mut(), above, left, 4, 4, 8);
+    pred_dc_128(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [128u8; 16]);
 
     pred_v(&mut output.as_region_mut(), above, 4, 4);
@@ -1594,7 +1584,7 @@ mod test {
       [33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36],
     ];
     for (&angle, expected) in angles.iter().zip(expected.iter()) {
-      pred_directional(
+      pred_directional::<_, 8>(
         &mut output.as_region_mut(),
         above,
         left,
@@ -1602,7 +1592,6 @@ mod test {
         angle,
         4,
         4,
-        8,
         None,
       );
       assert_eq!(&output.data[..], expected);
@@ -1617,7 +1606,7 @@ mod test {
 
     let mut o = Plane::from_slice(&vec![0u16; 32 * 32], 32);
 
-    pred_dc(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4, 16);
+    pred_dc(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4);
 
     for l in o.data.chunks(32).take(4) {
       for v in l[..4].iter() {
diff --git a/src/quantize/mod.rs b/src/quantize/mod.rs
index 72006361fb..2f6e2b103a 100644
--- a/src/quantize/mod.rs
+++ b/src/quantize/mod.rs
@@ -36,18 +36,24 @@ pub fn get_log_tx_scale(tx_size: TxSize) -> usize {
     + Into::<usize>::into(num_pixels > 1024)
 }
 
-pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
-  let dc_q: [&[NonZeroU16; 256]; 3] =
-    [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3];
-  let bd = ((bit_depth ^ 8) >> 1).min(2);
-  dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
+pub fn dc_q<const BD: usize>(qindex: u8, delta_q: i8) -> NonZeroU16 {
+  let dc_q = match BD {
+    8 => &dc_qlookup_Q3,
+    10 => &dc_qlookup_10_Q3,
+    12 => &dc_qlookup_12_Q3,
+    _ => unimplemented!(),
+  };
+  dc_q[((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
 }
 
-pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
-  let ac_q: [&[NonZeroU16; 256]; 3] =
-    [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3];
-  let bd = ((bit_depth ^ 8) >> 1).min(2);
-  ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
+pub fn ac_q<const BD: usize>(qindex: u8, delta_q: i8) -> NonZeroU16 {
+  let ac_q = match BD {
+    8 => &ac_qlookup_Q3,
+    10 => &ac_qlookup_10_Q3,
+    12 => &ac_qlookup_12_Q3,
+    _ => unimplemented!(),
+  };
+  ac_q[((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
 }
 
 // TODO: Handle lossless properly.
@@ -78,8 +84,8 @@ fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 {
   }
 }
 
-pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
-  let qlookup = match bit_depth {
+pub fn select_dc_qi<const BD: usize>(quantizer: i64) -> u8 {
+  let qlookup = match BD {
     8 => &dc_qlookup_Q3,
     10 => &dc_qlookup_10_Q3,
     12 => &dc_qlookup_12_Q3,
@@ -88,8 +94,8 @@ pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
   select_qi(quantizer, qlookup)
 }
 
-pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 {
-  let qlookup = match bit_depth {
+pub fn select_ac_qi<const BD: usize>(quantizer: i64) -> u8 {
+  let qlookup = match BD {
     8 => &ac_qlookup_Q3,
     10 => &ac_qlookup_10_Q3,
     12 => &ac_qlookup_12_Q3,
@@ -218,16 +224,16 @@ mod test {
 }
 
 impl QuantizationContext {
-  pub fn update(
-    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize,
-    dc_delta_q: i8, ac_delta_q: i8,
+  pub fn update<const BD: usize>(
+    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, dc_delta_q: i8,
+    ac_delta_q: i8,
   ) {
     self.log_tx_scale = get_log_tx_scale(tx_size);
 
-    self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth);
+    self.dc_quant = dc_q::<BD>(qindex, dc_delta_q);
     self.dc_mul_add = divu_gen(self.dc_quant.into());
 
-    self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth);
+    self.ac_quant = ac_q::<BD>(qindex, ac_delta_q);
     self.ac_mul_add = divu_gen(self.ac_quant.into());
 
     // All of these biases were derived by measuring the cost of coding
@@ -352,15 +358,15 @@ pub mod rust {
   use super::*;
   use crate::cpu_features::CpuFeatureLevel;
 
-  pub fn dequantize<T: Coefficient>(
+  pub fn dequantize<T: Coefficient, const BD: usize>(
     qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-    bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel,
+    dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel,
   ) {
     let log_tx_scale = get_log_tx_scale(tx_size) as i32;
     let offset = (1 << log_tx_scale) - 1;
 
-    let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32;
-    let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32;
+    let dc_quant = dc_q::<BD>(qindex, dc_delta_q).get() as i32;
+    let ac_quant = ac_q::<BD>(qindex, ac_delta_q).get() as i32;
 
     for (i, (r, c)) in rcoeffs
       .iter_mut()
diff --git a/src/rate.rs b/src/rate.rs
index 3ae7b09e1b..da0d98d636 100644
--- a/src/rate.rs
+++ b/src/rate.rs
@@ -522,15 +522,14 @@ fn chroma_offset(
 }
 
 impl QuantizerParameters {
-  fn new_from_log_q(
-    log_base_q: i64, log_target_q: i64, bit_depth: usize,
-    chroma_sampling: ChromaSampling, is_intra: bool,
-    log_isqrt_mean_scale: i64,
+  fn new_from_log_q<const BD: usize>(
+    log_base_q: i64, log_target_q: i64, chroma_sampling: ChromaSampling,
+    is_intra: bool, log_isqrt_mean_scale: i64,
   ) -> QuantizerParameters {
-    let scale = log_isqrt_mean_scale + q57(QSCALE + bit_depth as i32 - 8);
+    let scale = log_isqrt_mean_scale + q57(QSCALE + BD as i32 - 8);
 
     let mut log_q_y = log_target_q;
-    if !is_intra && bit_depth == 8 {
+    if !is_intra && BD == 8 {
       log_q_y = log_target_q
         + (log_target_q >> 32) * Q_MODEL_MUL[chroma_sampling as usize]
         + Q_MODEL_ADD[chroma_sampling as usize];
@@ -552,7 +551,7 @@ impl QuantizerParameters {
     let scale = |q| bexp64((log_target_q - q) * 2 + q57(16)) as f64 / 65536.;
     let dist_scale = [scale(log_q_y), scale(log_q_u), scale(log_q_v)];
 
-    let base_q_idx = select_ac_qi(quantizer, bit_depth).max(1);
+    let base_q_idx = select_ac_qi::<BD>(quantizer).max(1);
 
     // delta_q only gets 6 bits + a sign bit, so it can differ by 63 at most.
     let min_qi = base_q_idx.saturating_sub(63).max(1);
@@ -564,14 +563,14 @@ impl QuantizerParameters {
       log_target_q,
       // TODO: Allow lossless mode; i.e. qi == 0.
       dc_qi: [
-        clamp_qi(select_dc_qi(quantizer, bit_depth)),
-        if mono { 0 } else { clamp_qi(select_dc_qi(quantizer_u, bit_depth)) },
-        if mono { 0 } else { clamp_qi(select_dc_qi(quantizer_v, bit_depth)) },
+        clamp_qi(select_dc_qi::<BD>(quantizer)),
+        if mono { 0 } else { clamp_qi(select_dc_qi::<BD>(quantizer_u)) },
+        if mono { 0 } else { clamp_qi(select_dc_qi::<BD>(quantizer_v)) },
       ],
       ac_qi: [
         base_q_idx,
-        if mono { 0 } else { clamp_qi(select_ac_qi(quantizer_u, bit_depth)) },
-        if mono { 0 } else { clamp_qi(select_ac_qi(quantizer_v, bit_depth)) },
+        if mono { 0 } else { clamp_qi(select_ac_qi::<BD>(quantizer_u)) },
+        if mono { 0 } else { clamp_qi(select_ac_qi::<BD>(quantizer_v)) },
       ],
       lambda,
       dist_scale,
@@ -701,17 +700,16 @@ impl RCState {
     }
   }
 
-  pub(crate) fn select_first_pass_qi(
-    &self, bit_depth: usize, fti: usize, chroma_sampling: ChromaSampling,
+  pub(crate) fn select_first_pass_qi<const BD: usize>(
+    &self, fti: usize, chroma_sampling: ChromaSampling,
   ) -> QuantizerParameters {
     // Adjust the quantizer for the frame type, result is Q57:
     let log_q = ((self.pass1_log_base_q + (1i64 << 11)) >> 12)
       * (MQP_Q12[fti] as i64)
       + DQP_Q57[fti];
-    QuantizerParameters::new_from_log_q(
+    QuantizerParameters::new_from_log_q::<BD>(
       self.pass1_log_base_q,
       log_q,
-      bit_depth,
       chroma_sampling,
       fti == 0,
       0,
@@ -719,7 +717,7 @@ impl RCState {
   }
 
   // TODO: Separate quantizers for Cb and Cr.
-  pub(crate) fn select_qi<T: Pixel>(
+  pub(crate) fn select_qi<T: Pixel, const BD: usize>(
     &self, ctx: &ContextInner<T>, output_frameno: u64, fti: usize,
     maybe_prev_log_base_q: Option<i64>, log_isqrt_mean_scale: i64,
   ) -> QuantizerParameters {
@@ -727,14 +725,12 @@ impl RCState {
     if self.target_bitrate <= 0 {
       // Rate control is not active.
       // Derive quantizer directly from frame type.
-      let bit_depth = ctx.config.bit_depth;
       let chroma_sampling = ctx.config.chroma_sampling;
       let (log_base_q, log_q) =
-        Self::calc_flat_quantizer(ctx.config.quantizer as u8, bit_depth, fti);
-      QuantizerParameters::new_from_log_q(
+        Self::calc_flat_quantizer::<BD>(ctx.config.quantizer as u8, fti);
+      QuantizerParameters::new_from_log_q::<BD>(
         log_base_q,
         log_q,
-        bit_depth,
         chroma_sampling,
         fti == 0,
         log_isqrt_mean_scale,
@@ -748,11 +744,8 @@ impl RCState {
       match self.twopass_state {
         // First pass of 2-pass mode: use a fixed base quantizer.
         PASS_1 => {
-          return self.select_first_pass_qi(
-            ctx.config.bit_depth,
-            fti,
-            ctx.config.chroma_sampling,
-          );
+          return self
+            .select_first_pass_qi::<BD>(fti, ctx.config.chroma_sampling);
         }
         // Second pass of 2-pass mode: we know exactly how much of each frame
         //  type there is in the current buffer window, and have estimates for
@@ -906,17 +899,16 @@ impl RCState {
       //  in the binary log domain (binary exp and log aren't too bad):
       //  rate = exp2(log2(scale) - log2(quantizer)*exp)
       // There's no easy closed form solution, so we bisection searh for it.
-      let bit_depth = ctx.config.bit_depth;
       let chroma_sampling = ctx.config.chroma_sampling;
       // TODO: Proper handling of lossless.
-      let mut log_qlo = blog64(ac_q(self.ac_qi_min, 0, bit_depth).get() as i64)
-        - q57(QSCALE + bit_depth as i32 - 8);
+      let mut log_qlo = blog64(ac_q::<BD>(self.ac_qi_min, 0).get() as i64)
+        - q57(QSCALE + BD as i32 - 8);
       // The AC quantizer tables map to values larger than the DC quantizer
       //  tables, so we use that as the upper bound to make sure we can use
       //  the full table if needed.
       let mut log_qhi = blog64(
-        ac_q(self.maybe_ac_qi_max.unwrap_or(255), 0, bit_depth).get() as i64,
-      ) - q57(QSCALE + bit_depth as i32 - 8);
+        ac_q::<BD>(self.maybe_ac_qi_max.unwrap_or(255), 0).get() as i64,
+      ) - q57(QSCALE + BD as i32 - 8);
       let mut log_base_q = (log_qlo + log_qhi) >> 1;
       while log_qlo < log_qhi {
         // Count bits contributed by each frame type using the model.
@@ -1020,20 +1012,19 @@ impl RCState {
 
       if let Some(qi_max) = self.maybe_ac_qi_max {
         let (max_log_base_q, max_log_q) =
-          Self::calc_flat_quantizer(qi_max, ctx.config.bit_depth, fti);
+          Self::calc_flat_quantizer::<BD>(qi_max, fti);
         log_base_q = cmp::min(log_base_q, max_log_base_q);
         log_q = cmp::min(log_q, max_log_q);
       }
       if self.ac_qi_min > 0 {
         let (min_log_base_q, min_log_q) =
-          Self::calc_flat_quantizer(self.ac_qi_min, ctx.config.bit_depth, fti);
+          Self::calc_flat_quantizer::<BD>(self.ac_qi_min, fti);
         log_base_q = cmp::max(log_base_q, min_log_base_q);
         log_q = cmp::max(log_q, min_log_q);
       }
-      QuantizerParameters::new_from_log_q(
+      QuantizerParameters::new_from_log_q::<BD>(
         log_base_q,
         log_q,
-        bit_depth,
         chroma_sampling,
         fti == 0,
         log_isqrt_mean_scale,
@@ -1043,8 +1034,8 @@ impl RCState {
 
   // Computes a quantizer directly from the frame type and base quantizer index,
   // without consideration for rate control.
-  fn calc_flat_quantizer(
-    base_qi: u8, bit_depth: usize, fti: usize,
+  fn calc_flat_quantizer<const BD: usize>(
+    base_qi: u8, fti: usize,
   ) -> (i64, i64) {
     // TODO: Rename "quantizer" something that indicates it is a quantizer
     //  index, and move it somewhere more sensible (or choose a better way to
@@ -1052,13 +1043,13 @@ impl RCState {
 
     // We use the AC quantizer as the source quantizer since its quantizer
     //  tables have unique entries, while the DC tables do not.
-    let ac_quantizer = ac_q(base_qi, 0, bit_depth).get() as i64;
+    let ac_quantizer = ac_q::<BD>(base_qi, 0).get() as i64;
     // Pick the nearest DC entry since an exact match may be unavailable.
-    let dc_qi = select_dc_qi(ac_quantizer, bit_depth);
-    let dc_quantizer = dc_q(dc_qi, 0, bit_depth).get() as i64;
+    let dc_qi = select_dc_qi::<BD>(ac_quantizer);
+    let dc_quantizer = dc_q::<BD>(dc_qi, 0).get() as i64;
     // Get the log quantizers as Q57.
-    let log_ac_q = blog64(ac_quantizer) - q57(QSCALE + bit_depth as i32 - 8);
-    let log_dc_q = blog64(dc_quantizer) - q57(QSCALE + bit_depth as i32 - 8);
+    let log_ac_q = blog64(ac_quantizer) - q57(QSCALE + BD as i32 - 8);
+    let log_dc_q = blog64(dc_quantizer) - q57(QSCALE + BD as i32 - 8);
     // Target the midpoint of the chosen entries.
     let log_base_q = (log_ac_q + log_dc_q + 1) >> 1;
     // Adjust the quantizer for the frame type, result is Q57:
@@ -1255,11 +1246,13 @@ impl RCState {
     cur_pos
   }
 
-  pub(crate) fn select_pass1_log_base_q<T: Pixel>(
+  pub(crate) fn select_pass1_log_base_q<T: Pixel, const BD: usize>(
     &self, ctx: &ContextInner<T>, output_frameno: u64,
   ) -> i64 {
     assert_eq!(self.twopass_state, PASS_SINGLE);
-    self.select_qi(ctx, output_frameno, FRAME_SUBTYPE_I, None, 0).log_base_q
+    self
+      .select_qi::<_, BD>(ctx, output_frameno, FRAME_SUBTYPE_I, None, 0)
+      .log_base_q
   }
 
   // Initialize the first pass and emit a placeholder summary
diff --git a/src/rdo.rs b/src/rdo.rs
index c92b383b92..1dfdba438d 100644
--- a/src/rdo.rs
+++ b/src/rdo.rs
@@ -139,9 +139,13 @@ pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 {
 }
 
 #[allow(unused)]
-pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
+pub fn cdef_dist_wxh<
+  T: Pixel,
+  F: Fn(Area, BlockSize) -> DistortionScale,
+  const BD: usize,
+>(
   src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
-  bit_depth: usize, compute_bias: F, cpu: CpuFeatureLevel,
+  compute_bias: F, cpu: CpuFeatureLevel,
 ) -> Distortion {
   debug_assert!(src1.plane_cfg.xdec == 0);
   debug_assert!(src1.plane_cfg.ydec == 0);
@@ -155,12 +159,11 @@ pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
       let kernel_w = (w - x).min(8);
       let area = Area::StartingAt { x: x as isize, y: y as isize };
 
-      let value = RawDistortion(cdef_dist_kernel(
+      let value = RawDistortion(cdef_dist_kernel::<_, BD>(
         &src1.subregion(area),
         &src2.subregion(area),
         kernel_w,
         kernel_h,
-        bit_depth,
         cpu,
       ) as u64);
 
@@ -174,9 +177,13 @@ pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
 
 /// Sum of Squared Error for a wxh block
 /// Currently limited to w and h of valid blocks
-pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
+pub fn sse_wxh<
+  T: Pixel,
+  F: Fn(Area, BlockSize) -> DistortionScale,
+  const BD: usize,
+>(
   src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
-  compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel,
+  compute_bias: F, cpu: CpuFeatureLevel,
 ) -> Distortion {
   // See get_weighted_sse in src/dist.rs.
   // Provide a scale to get_weighted_sse for each square region of this size.
@@ -218,9 +225,7 @@ pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
     }
   }
 
-  Distortion(get_weighted_sse(
-    src1, src2, buf, buf_stride, w, h, bit_depth, cpu,
-  ))
+  Distortion(get_weighted_sse(src1, src2, buf, buf_stride, w, h, cpu))
 }
 
 pub const fn clip_visible_bsize(
@@ -249,7 +254,7 @@ pub const fn clip_visible_bsize(
 }
 
 // Compute the pixel-domain distortion for an encode
-fn compute_distortion<T: Pixel>(
+fn compute_distortion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
   is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool,
 ) -> ScaledDistortion {
@@ -272,12 +277,11 @@ fn compute_distortion<T: Pixel>(
   }
 
   let mut distortion = match fi.config.tune {
-    Tune::Psychovisual => cdef_dist_wxh(
+    Tune::Psychovisual => cdef_dist_wxh::<_, _, BD>(
       &input_region,
       &rec_region,
       visible_w,
       visible_h,
-      fi.sequence.bit_depth,
       |bias_area, bsize| {
         distortion_scale(
           fi,
@@ -287,7 +291,7 @@ fn compute_distortion<T: Pixel>(
       },
       fi.cpu_feature_level,
     ),
-    Tune::Psnr => sse_wxh(
+    Tune::Psnr => sse_wxh::<_, _, BD>(
       &input_region,
       &rec_region,
       visible_w,
@@ -299,7 +303,6 @@ fn compute_distortion<T: Pixel>(
           bsize,
         )
       },
-      fi.sequence.bit_depth,
       fi.cpu_feature_level,
     ),
   } * fi.dist_scale[0];
@@ -323,7 +326,7 @@ fn compute_distortion<T: Pixel>(
     for p in 1..3 {
       let input_region = ts.input_tile.planes[p].subregion(area);
       let rec_region = ts.rec.planes[p].subregion(area);
-      distortion += sse_wxh(
+      distortion += sse_wxh::<_, _, BD>(
         &input_region,
         &rec_region,
         chroma_w,
@@ -335,7 +338,6 @@ fn compute_distortion<T: Pixel>(
             bsize,
           )
         },
-        fi.sequence.bit_depth,
         fi.cpu_feature_level,
       ) * fi.dist_scale[p];
     }
@@ -344,7 +346,7 @@ fn compute_distortion<T: Pixel>(
 }
 
 // Compute the transform-domain distortion for an encode
-fn compute_tx_distortion<T: Pixel>(
+fn compute_tx_distortion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
   is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion,
   skip: bool, luma_only: bool,
@@ -372,7 +374,7 @@ fn compute_tx_distortion<T: Pixel>(
   }
 
   let mut distortion = if skip {
-    sse_wxh(
+    sse_wxh::<_, _, BD>(
       &input_region,
       &rec_region,
       visible_w,
@@ -384,7 +386,6 @@ fn compute_tx_distortion<T: Pixel>(
           bsize,
         )
       },
-      fi.sequence.bit_depth,
       fi.cpu_feature_level,
     ) * fi.dist_scale[0]
   } else {
@@ -411,7 +412,7 @@ fn compute_tx_distortion<T: Pixel>(
     for p in 1..3 {
       let input_region = ts.input_tile.planes[p].subregion(area);
       let rec_region = ts.rec.planes[p].subregion(area);
-      distortion += sse_wxh(
+      distortion += sse_wxh::<_, _, BD>(
         &input_region,
         &rec_region,
         chroma_w,
@@ -423,7 +424,6 @@ fn compute_tx_distortion<T: Pixel>(
             bsize,
           )
         },
-        fi.sequence.bit_depth,
         fi.cpu_feature_level,
       ) * fi.dist_scale[p];
     }
@@ -720,7 +720,7 @@ pub fn compute_rd_cost<T: Pixel>(
   fi.lambda.mul_add(rate_in_bits, distortion.0 as f64)
 }
 
-pub fn rdo_tx_size_type<T: Pixel>(
+pub fn rdo_tx_size_type<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
@@ -759,7 +759,7 @@ pub fn rdo_tx_size_type<T: Pixel>(
       if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] };
 
     // Luma plane transform type decision
-    let (tx_type, rd_cost) = rdo_tx_type_decision(
+    let (tx_type, rd_cost) = rdo_tx_type_decision::<_, BD>(
       fi,
       ts,
       cw,
@@ -810,7 +810,7 @@ const fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool {
 }
 
 #[inline]
-fn luma_chroma_mode_rdo<T: Pixel>(
+fn luma_chroma_mode_rdo<T: Pixel, const BD: usize>(
   luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize,
   tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, rdo_type: RDOType,
@@ -857,7 +857,7 @@ fn luma_chroma_mode_rdo<T: Pixel>(
     for sidx in select_segment(fi, ts, tile_bo, bsize, skip) {
       cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx);
 
-      let (tx_size, tx_type) = rdo_tx_size_type(
+      let (tx_size, tx_type) = rdo_tx_size_type::<_, BD>(
         fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip,
       );
       for &chroma_mode in mode_set_chroma.iter() {
@@ -878,7 +878,7 @@ fn luma_chroma_mode_rdo<T: Pixel>(
           luma_mode_is_intra && tx_size.block_size() != bsize;
 
         encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip);
-        let (has_coeff, tx_dist) = encode_block_post_cdef(
+        let (has_coeff, tx_dist) = encode_block_post_cdef::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -903,7 +903,7 @@ fn luma_chroma_mode_rdo<T: Pixel>(
 
         let rate = wr.tell_frac() - tell;
         let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel {
-          compute_tx_distortion(
+          compute_tx_distortion::<_, BD>(
             fi,
             ts,
             bsize,
@@ -914,7 +914,14 @@ fn luma_chroma_mode_rdo<T: Pixel>(
             false,
           )
         } else {
-          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false)
+          compute_distortion::<_, BD>(
+            fi,
+            ts,
+            bsize,
+            is_chroma_block,
+            tile_bo,
+            false,
+          )
         };
         let is_zero_dist = distortion.0 == 0;
         let rd = compute_rd_cost(fi, rate, distortion);
@@ -956,7 +963,7 @@ fn luma_chroma_mode_rdo<T: Pixel>(
 ///
 /// - If the best RD found is negative.
 ///   This should never happen and indicates a development error.
-pub fn rdo_mode_decision<T: Pixel>(
+pub fn rdo_mode_decision<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   inter_cfg: &InterConfig,
@@ -975,7 +982,7 @@ pub fn rdo_mode_decision<T: Pixel>(
   let mut best = if fi.frame_type.has_inter() {
     assert!(fi.frame_type != FrameType::KEY);
 
-    inter_frame_rdo_mode_decision(
+    inter_frame_rdo_mode_decision::<_, BD>(
       fi,
       ts,
       cw,
@@ -993,7 +1000,7 @@ pub fn rdo_mode_decision<T: Pixel>(
     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
 
   if !best.skip {
-    best = intra_frame_rdo_mode_decision(
+    best = intra_frame_rdo_mode_decision::<_, BD>(
       fi,
       ts,
       cw,
@@ -1014,7 +1021,7 @@ pub fn rdo_mode_decision<T: Pixel>(
     let mut wr = WriterCounter::new();
     let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 };
 
-    write_tx_blocks(
+    write_tx_blocks::<_, _, BD>(
       fi,
       ts,
       cw,
@@ -1034,7 +1041,9 @@ pub fn rdo_mode_decision<T: Pixel>(
     );
     cw.rollback(&cw_checkpoint);
     if fi.sequence.chroma_sampling != ChromaSampling::Cs400 {
-      if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) {
+      if let Some(cfl) =
+        rdo_cfl_alpha::<_, BD>(ts, tile_bo, bsize, best.tx_size, fi)
+      {
         let mut wr = WriterCounter::new();
         let tell = wr.tell_frac();
 
@@ -1047,7 +1056,7 @@ pub fn rdo_mode_decision<T: Pixel>(
           tile_bo,
           best.skip,
         );
-        let (has_coeff, _) = encode_block_post_cdef(
+        let (has_coeff, _) = encode_block_post_cdef::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -1073,8 +1082,14 @@ pub fn rdo_mode_decision<T: Pixel>(
         let rate = wr.tell_frac() - tell;
 
         // For CFL, tx-domain distortion is not an option.
-        let distortion =
-          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false);
+        let distortion = compute_distortion::<_, BD>(
+          fi,
+          ts,
+          bsize,
+          is_chroma_block,
+          tile_bo,
+          false,
+        );
         let rd = compute_rd_cost(fi, rate, distortion);
         if rd < best.rd_cost {
           best.rd_cost = rd;
@@ -1113,7 +1128,7 @@ pub fn rdo_mode_decision<T: Pixel>(
   }
 }
 
-fn inter_frame_rdo_mode_decision<T: Pixel>(
+fn inter_frame_rdo_mode_decision<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint,
@@ -1175,7 +1190,7 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
       pmv[1] = mv_stack[1].this_mv;
     }
 
-    let res = estimate_motion(
+    let res = estimate_motion::<_, BD>(
       fi,
       ts,
       bsize.width(),
@@ -1320,7 +1335,7 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
       let mut rec_region =
         rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
 
-      luma_mode.predict_inter(
+      luma_mode.predict_inter::<_, BD>(
         fi,
         tile_rect,
         0,
@@ -1337,12 +1352,11 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
         .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
       let plane_ref = rec_region.as_const();
 
-      let satd = get_satd(
+      let satd = get_satd::<_, BD>(
         &plane_org,
         &plane_ref,
         bsize.width(),
         bsize.height(),
-        fi.sequence.bit_depth,
         fi.cpu_feature_level,
       );
       satds.push(satd);
@@ -1361,7 +1375,7 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
     |&((luma_mode, i), mvs, _satd)| {
       let mode_set_chroma = ArrayVec::from([luma_mode]);
 
-      luma_chroma_mode_rdo(
+      luma_chroma_mode_rdo::<_, BD>(
         luma_mode,
         fi,
         bsize,
@@ -1385,7 +1399,7 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
   best
 }
 
-fn intra_frame_rdo_mode_decision<T: Pixel>(
+fn intra_frame_rdo_mode_decision<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType,
@@ -1432,7 +1446,7 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
         let rec = &ts.rec.planes[0].as_const();
         let po = tile_bo.plane_offset(rec.plane_cfg);
         // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block
-        get_intra_edges(
+        get_intra_edges::<_, BD>(
           rec,
           tile_bo,
           0,
@@ -1440,7 +1454,6 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
           bsize,
           po,
           tx_size,
-          fi.sequence.bit_depth,
           None,
           fi.sequence.enable_intra_edge_filter,
           IntraParam::None,
@@ -1466,11 +1479,10 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
         let mut rec_region =
           rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
         // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block
-        luma_mode.predict_intra(
+        luma_mode.predict_intra::<_, BD>(
           tile_rect,
           &mut rec_region,
           tx_size,
-          fi.sequence.bit_depth,
           &[0i16; 2],
           IntraParam::None,
           if luma_mode.is_directional() { ief_params } else { None },
@@ -1482,12 +1494,11 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
           .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
         let plane_ref = rec_region.as_const();
 
-        satds_all[luma_mode as usize] = get_satd(
+        satds_all[luma_mode as usize] = get_satd::<_, BD>(
           &plane_org,
           &plane_ref,
           tx_size.width(),
           tx_size.height(),
-          fi.sequence.bit_depth,
           fi.cpu_feature_level,
         );
       }
@@ -1507,7 +1518,7 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
     if is_chroma_block && luma_mode != PredictionMode::DC_PRED {
       mode_set_chroma.push(PredictionMode::DC_PRED);
     }
-    luma_chroma_mode_rdo(
+    luma_chroma_mode_rdo::<_, BD>(
       luma_mode,
       fi,
       bsize,
@@ -1541,7 +1552,7 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
     let mut best_angle_delta = best.angle_delta;
     let mut angle_delta_rdo = |y, uv| -> AngleDelta {
       if best.angle_delta.y != y || best.angle_delta.uv != uv {
-        luma_chroma_mode_rdo(
+        luma_chroma_mode_rdo::<_, BD>(
           best.pred_mode_luma,
           fi,
           bsize,
@@ -1581,7 +1592,7 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
 /// # Panics
 ///
 /// - If the block size is invalid for subsampling.
-pub fn rdo_cfl_alpha<T: Pixel>(
+pub fn rdo_cfl_alpha<T: Pixel, const BD: usize>(
   ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize,
   luma_tx_size: TxSize, fi: &FrameInvariants<T>,
 ) -> Option<CFLParams> {
@@ -1613,7 +1624,7 @@ pub fn rdo_cfl_alpha<T: Pixel>(
       let rec = &mut ts.rec.planes[p];
       let input = &ts.input_tile.planes[p];
       let po = tile_bo.plane_offset(rec.plane_cfg);
-      let edge_buf = get_intra_edges(
+      let edge_buf = get_intra_edges::<_, BD>(
         &rec.as_const(),
         tile_bo,
         0,
@@ -1621,7 +1632,6 @@ pub fn rdo_cfl_alpha<T: Pixel>(
         bsize,
         po,
         uv_tx_size,
-        fi.sequence.bit_depth,
         Some(PredictionMode::UV_CFL_PRED),
         fi.sequence.enable_intra_edge_filter,
         IntraParam::None,
@@ -1629,24 +1639,22 @@ pub fn rdo_cfl_alpha<T: Pixel>(
       let mut alpha_cost = |alpha: i16| -> u64 {
         let mut rec_region =
           rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
-        PredictionMode::UV_CFL_PRED.predict_intra(
+        PredictionMode::UV_CFL_PRED.predict_intra::<_, BD>(
           tile_rect,
           &mut rec_region,
           uv_tx_size,
-          fi.sequence.bit_depth,
           &ac.data,
           IntraParam::Alpha(alpha),
           None,
           &edge_buf,
           fi.cpu_feature_level,
         );
-        sse_wxh(
+        sse_wxh::<_, _, BD>(
           &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
           &rec_region.as_const(),
           visible_tx_w,
           visible_tx_h,
           |_, _| DistortionScale::default(), // We're not doing RDO here.
-          fi.sequence.bit_depth,
           fi.cpu_feature_level,
         )
         .0
@@ -1688,7 +1696,7 @@ pub fn rdo_cfl_alpha<T: Pixel>(
 ///   This should never happen and indicates a development error.
 /// - If the best RD found is negative.
 ///   This should never happen and indicates a development error.
-pub fn rdo_tx_type_decision<T: Pixel>(
+pub fn rdo_tx_type_decision<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>,
   mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
@@ -1726,7 +1734,7 @@ pub fn rdo_tx_type_decision<T: Pixel>(
     }
 
     if is_inter {
-      motion_compensate(
+      motion_compensate::<_, BD>(
         fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true,
       );
     }
@@ -1734,7 +1742,7 @@ pub fn rdo_tx_type_decision<T: Pixel>(
     let mut wr = WriterCounter::new();
     let tell = wr.tell_frac();
     let (_, tx_dist) = if is_inter {
-      write_tx_tree(
+      write_tx_tree::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -1751,7 +1759,7 @@ pub fn rdo_tx_type_decision<T: Pixel>(
         need_recon_pixel,
       )
     } else {
-      write_tx_blocks(
+      write_tx_blocks::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -1773,7 +1781,7 @@ pub fn rdo_tx_type_decision<T: Pixel>(
 
     let rate = wr.tell_frac() - tell;
     let distortion = if fi.use_tx_domain_distortion {
-      compute_tx_distortion(
+      compute_tx_distortion::<_, BD>(
         fi,
         ts,
         bsize,
@@ -1784,7 +1792,14 @@ pub fn rdo_tx_type_decision<T: Pixel>(
         true,
       )
     } else {
-      compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true)
+      compute_distortion::<_, BD>(
+        fi,
+        ts,
+        bsize,
+        is_chroma_block,
+        tile_bo,
+        true,
+      )
     };
     cw.rollback(cw_checkpoint.as_ref().unwrap());
 
@@ -1836,14 +1851,14 @@ pub fn get_sub_partitions(
 }
 
 #[inline(always)]
-fn rdo_partition_none<T: Pixel>(
+fn rdo_partition_none<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>,
 ) -> f64 {
   debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
 
-  let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
+  let mode = rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg);
   let cost = mode.rd_cost;
 
   child_modes.push(mode);
@@ -1853,7 +1868,7 @@ fn rdo_partition_none<T: Pixel>(
 
 // VERTICAL, HORIZONTAL or simple SPLIT
 #[inline(always)]
-fn rdo_partition_simple<T: Pixel, W: Writer>(
+fn rdo_partition_simple<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig,
@@ -1895,7 +1910,7 @@ fn rdo_partition_simple<T: Pixel, W: Writer>(
 
     if has_cols && has_rows {
       let mode_decision =
-        rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg);
+        rdo_mode_decision::<_, BD>(fi, ts, cw, subsize, offset, inter_cfg);
 
       rd_cost_sum += mode_decision.rd_cost;
 
@@ -1907,7 +1922,7 @@ fn rdo_partition_simple<T: Pixel, W: Writer>(
           if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
         cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize);
       }
-      encode_block_with_modes(
+      encode_block_with_modes::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -1935,7 +1950,7 @@ fn rdo_partition_simple<T: Pixel, W: Writer>(
 ///
 /// - If the best RD found is negative.
 ///   This should never happen, and indicates a development error.
-pub fn rdo_partition_decision<T: Pixel, W: Writer>(
+pub fn rdo_partition_decision<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset,
@@ -1960,7 +1975,7 @@ pub fn rdo_partition_decision<T: Pixel, W: Writer>(
 
     let cost = match partition {
       PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => {
-        Some(rdo_partition_none(
+        Some(rdo_partition_none::<_, BD>(
           fi,
           ts,
           cw,
@@ -1971,7 +1986,7 @@ pub fn rdo_partition_decision<T: Pixel, W: Writer>(
         ))
       }
       PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => {
-        rdo_partition_simple(
+        rdo_partition_simple::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2012,7 +2027,7 @@ pub fn rdo_partition_decision<T: Pixel, W: Writer>(
   }
 }
 
-fn rdo_loop_plane_error<T: Pixel>(
+fn rdo_loop_plane_error<T: Pixel, const BD: usize>(
   base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset,
   sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
   blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize,
@@ -2054,23 +2069,21 @@ fn rdo_loop_plane_error<T: Pixel>(
           // For loop filters, We intentionally use cdef_dist even with
           // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a
           // significant negative impact on other metrics and visual quality.
-          RawDistortion(cdef_dist_kernel(
+          RawDistortion(cdef_dist_kernel::<_, BD>(
             &src_region,
             &test_region,
             8,
             8,
-            fi.sequence.bit_depth,
             fi.cpu_feature_level,
           ) as u64)
             * bias
         } else {
-          sse_wxh(
+          sse_wxh::<_, _, BD>(
             &src_region,
             &test_region,
             8 >> xdec,
             8 >> ydec,
             |_, _| bias,
-            fi.sequence.bit_depth,
             fi.cpu_feature_level,
           )
         };
@@ -2088,7 +2101,7 @@ fn rdo_loop_plane_error<T: Pixel>(
 /// # Panics
 ///
 /// - If both CDEF and LRF are disabled.
-pub fn rdo_loop_decision<T: Pixel, W: Writer>(
+pub fn rdo_loop_decision<T: Pixel, W: Writer, const BD: usize>(
   base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>,
   ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W,
   deblock_p: bool,
@@ -2285,7 +2298,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
     // Find a good deblocking filter solution for the passed in area.
     // This is not RDO of deblocking itself, merely a solution to get
     // better results from CDEF/LRF RDO.
-    let deblock_levels = deblock_filter_optimize(
+    let deblock_levels = deblock_filter_optimize::<_, _, BD>(
       fi,
       &rec_subset.as_tile(),
       &src_subset,
@@ -2301,13 +2314,12 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
       deblock_copy.levels = deblock_levels;
 
       // finally, deblock the temp frame
-      deblock_filter_frame(
+      deblock_filter_frame::<_, BD>(
         &deblock_copy,
         &mut rec_subset.as_tile_mut(),
         &tileblocks_subset.as_const(),
         crop_w,
         crop_h,
-        fi.sequence.bit_depth,
         planes,
       );
     }
@@ -2335,7 +2347,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
     if cdef_work.is_some() {
       Some((
         &rec_subset,
-        cdef_analyze_superblock_range(
+        cdef_analyze_superblock_range::<_, BD>(
           fi,
           &rec_subset,
           &tileblocks_subset.as_const(),
@@ -2382,7 +2394,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
             let mut err = ScaledDistortion::zero();
             let mut rate = 0;
 
-            cdef_filter_superblock(
+            cdef_filter_superblock::<_, BD>(
               fi,
               &rec_subset,
               &mut cdef_ref.as_tile_mut(),
@@ -2418,7 +2430,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                 // We have a valid LRU, apply LRF, compute error
                 match best_lrf[lru_y * lru_w[pli] + lru_x][pli] {
                   RestorationFilter::None {} => {
-                    err += rdo_loop_plane_error(
+                    err += rdo_loop_plane_error::<_, BD>(
                       base_sbo,
                       loop_sbo,
                       1,
@@ -2459,7 +2471,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                       &cdef_ref.planes[pli].slice(loop_po),
                       &cdef_ref.planes[pli].slice(loop_po),
                     );
-                    sgrproj_stripe_filter(
+                    sgrproj_stripe_filter::<_, _, BD>(
                       set,
                       xqd,
                       fi,
@@ -2473,7 +2485,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                         height: vis_height,
                       }),
                     );
-                    err += rdo_loop_plane_error(
+                    err += rdo_loop_plane_error::<_, BD>(
                       base_sbo,
                       loop_sbo,
                       1,
@@ -2496,7 +2508,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                 }
               } else {
                 // No actual LRU here, compute error directly from CDEF output.
-                err += rdo_loop_plane_error(
+                err += rdo_loop_plane_error::<_, BD>(
                   base_sbo,
                   loop_sbo,
                   1,
@@ -2540,7 +2552,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
 
           // Keep cdef output up to date; we need it for restoration
           // both below and above (padding)
-          cdef_filter_superblock(
+          cdef_filter_superblock::<_, BD>(
             fi,
             rec_copy,
             &mut cdef_ref_tm,
@@ -2605,7 +2617,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
 
               // Check the no filter option
               {
-                let err = rdo_loop_plane_error(
+                let err = rdo_loop_plane_error::<_, BD>(
                   base_sbo,
                   loop_sbo,
                   lru_sb_w,
@@ -2660,7 +2672,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
 
               for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity)
               {
-                let (xqd0, xqd1) = sgrproj_solve(
+                let (xqd0, xqd1) = sgrproj_solve::<_, BD>(
                   set,
                   fi,
                   &ts.integral_buffer,
@@ -2673,7 +2685,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                 let current_lrf =
                   RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] };
                 if let RestorationFilter::Sgrproj { set, xqd } = current_lrf {
-                  sgrproj_stripe_filter(
+                  sgrproj_stripe_filter::<_, _, BD>(
                     set,
                     xqd,
                     fi,
@@ -2688,7 +2700,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                     }),
                   );
                 }
-                let err = rdo_loop_plane_error(
+                let err = rdo_loop_plane_error::<_, BD>(
                   base_sbo,
                   loop_sbo,
                   lru_sb_w,
diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs
index 7414f09d3b..037ac8aadd 100644
--- a/src/scenechange/mod.rs
+++ b/src/scenechange/mod.rs
@@ -86,8 +86,6 @@ pub struct SceneChangeDetector<T: Pixel> {
   score_deque: Vec<ScenecutResult>,
   /// Number of pixels in scaled frame for fast mode
   pixels: usize,
-  /// The bit depth of the video.
-  bit_depth: usize,
   /// The CPU feature level to be used.
   cpu_feature_level: CpuFeatureLevel,
   encoder_config: EncoderConfig,
@@ -147,7 +145,6 @@ impl<T: Pixel> SceneChangeDetector<T> {
       deque_offset,
       score_deque,
       pixels,
-      bit_depth,
       cpu_feature_level,
       encoder_config,
       sequence,
@@ -165,7 +162,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
   ///
   /// This will gracefully handle the first frame in the video as well.
   #[hawktracer(analyze_next_frame)]
-  pub fn analyze_next_frame(
+  pub fn analyze_next_frame<const BD: usize>(
     &mut self, frame_set: &[&Arc<Frame<T>>], input_frameno: u64,
     previous_keyframe: u64,
   ) -> bool {
@@ -196,9 +193,13 @@ impl<T: Pixel> SceneChangeDetector<T> {
       && frame_set.len() > self.deque_offset + 1
       && self.score_deque.is_empty()
     {
-      self.initialize_score_deque(frame_set, input_frameno, self.deque_offset);
+      self.initialize_score_deque::<BD>(
+        frame_set,
+        input_frameno,
+        self.deque_offset,
+      );
     } else if self.score_deque.is_empty() {
-      self.initialize_score_deque(
+      self.initialize_score_deque::<BD>(
         frame_set,
         input_frameno,
         frame_set.len() - 1,
@@ -209,7 +210,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
     // Running single frame comparison and adding it to deque
     // Decrease deque offset if there is no new frames
     if frame_set.len() > self.deque_offset + 1 {
-      self.run_comparison(
+      self.run_comparison::<BD>(
         frame_set[self.deque_offset].clone(),
         frame_set[self.deque_offset + 1].clone(),
         input_frameno + self.deque_offset as u64,
@@ -219,7 +220,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
     }
 
     // Adaptive scenecut check
-    let (scenecut, score) = self.adaptive_scenecut();
+    let (scenecut, score) = self.adaptive_scenecut::<BD>();
     let scenecut = self.handle_min_max_intervals(distance).unwrap_or(scenecut);
     debug!(
       "[SC-Detect] Frame {}: Raw={:5.1}  ImpBl={:5.1}  Bwd={:5.1}  Fwd={:5.1}  Th={:.1}  {}",
@@ -253,12 +254,12 @@ impl<T: Pixel> SceneChangeDetector<T> {
   }
 
   // Initially fill score deque with frame scores
-  fn initialize_score_deque(
+  fn initialize_score_deque<const BD: usize>(
     &mut self, frame_set: &[&Arc<Frame<T>>], input_frameno: u64,
     init_len: usize,
   ) {
     for x in 0..init_len {
-      self.run_comparison(
+      self.run_comparison::<BD>(
         frame_set[x].clone(),
         frame_set[x + 1].clone(),
         input_frameno + x as u64,
@@ -268,14 +269,14 @@ impl<T: Pixel> SceneChangeDetector<T> {
 
   /// Runs scene change comparison beetween 2 given frames
   /// Insert result to start of score deque
-  fn run_comparison(
+  fn run_comparison<const BD: usize>(
     &mut self, frame1: Arc<Frame<T>>, frame2: Arc<Frame<T>>,
     input_frameno: u64,
   ) {
     let mut result = if self.speed_mode == SceneDetectionSpeed::Fast {
       self.fast_scenecut(frame1, frame2)
     } else {
-      self.cost_scenecut(frame1, frame2, input_frameno)
+      self.cost_scenecut::<BD>(frame1, frame2, input_frameno)
     };
 
     // Subtract the highest metric value of surrounding frames from the current one
@@ -322,7 +323,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
   /// Compares current scene score to adapted threshold based on previous scores
   /// Value of current frame is offset by lookahead, if lookahead >=5
   /// Returns true if current scene score is higher than adapted threshold
-  fn adaptive_scenecut(&mut self) -> (bool, ScenecutResult) {
+  fn adaptive_scenecut<const BD: usize>(&mut self) -> (bool, ScenecutResult) {
     let score = self.score_deque[self.deque_offset];
 
     // We use the importance block algorithm's cost metrics as a secondary algorithm
@@ -333,8 +334,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
     // the importance block algorithm is over the threshold either on this frame (hard scenecut)
     // or within the past few frames (pan). This helps filter out a few false positives
     // produced by the cost-based algorithm.
-    let imp_block_threshold =
-      IMP_BLOCK_DIFF_THRESHOLD * (self.bit_depth as f64) / 8.0;
+    let imp_block_threshold = IMP_BLOCK_DIFF_THRESHOLD * (BD as f64) / 8.0;
     if !&self.score_deque[self.deque_offset..]
       .iter()
       .any(|result| result.imp_block_cost >= imp_block_threshold)
diff --git a/src/scenechange/standard.rs b/src/scenechange/standard.rs
index 1f058271df..2f452164e9 100644
--- a/src/scenechange/standard.rs
+++ b/src/scenechange/standard.rs
@@ -18,7 +18,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
   /// We gather both intra and inter costs for the frames,
   /// as well as an importance-block-based difference,
   /// and use all three metrics.
-  pub(super) fn cost_scenecut(
+  pub(super) fn cost_scenecut<const BD: usize>(
     &mut self, frame1: Arc<Frame<T>>, frame2: Arc<Frame<T>>,
     input_frameno: u64,
   ) -> ScenecutResult {
@@ -49,10 +49,9 @@ impl<T: Pixel> SceneChangeDetector<T> {
 
         let intra_costs =
           self.intra_costs.entry(input_frameno).or_insert_with(|| {
-            estimate_intra_costs(
+            estimate_intra_costs::<_, BD>(
               temp_plane,
               &*frame2,
-              self.bit_depth,
               self.cpu_feature_level,
             )
           });
@@ -67,10 +66,9 @@ impl<T: Pixel> SceneChangeDetector<T> {
         };
       });
       s.spawn(|_| {
-        mv_inter_cost = estimate_inter_costs(
+        mv_inter_cost = estimate_inter_costs::<_, BD>(
           frame2_inter_ref,
           frame1,
-          self.bit_depth,
           self.encoder_config.clone(),
           self.sequence.clone(),
           buffer,
diff --git a/src/segmentation.rs b/src/segmentation.rs
index 36ee42fb1c..776d90c265 100644
--- a/src/segmentation.rs
+++ b/src/segmentation.rs
@@ -19,7 +19,7 @@ use crate::FrameState;
 
 pub const MAX_SEGMENTS: usize = 8;
 
-pub fn segmentation_optimize<T: Pixel>(
+pub fn segmentation_optimize<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, fs: &mut FrameState<T>,
 ) {
   assert!(fi.enable_segmentation);
@@ -50,11 +50,11 @@ pub fn segmentation_optimize<T: Pixel>(
       }
       assert_ne!(min_segment, MAX_SEGMENTS);
       fs.segmentation.min_segment = min_segment as u8;
-      fs.segmentation.update_threshold(fi.base_q_idx, fi.config.bit_depth);
+      fs.segmentation.update_threshold::<BD>(fi.base_q_idx);
       return;
     }
 
-    segmentation_optimize_inner(fi, fs, offset_lower_limit);
+    segmentation_optimize_inner::<_, BD>(fi, fs, offset_lower_limit);
 
     /* Figure out parameters */
     fs.segmentation.preskip = false;
@@ -73,7 +73,7 @@ pub fn segmentation_optimize<T: Pixel>(
 }
 
 // Select target quantizers for each segment by fitting to log(scale).
-fn segmentation_optimize_inner<T: Pixel>(
+fn segmentation_optimize_inner<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, fs: &mut FrameState<T>, offset_lower_limit: i16,
 ) {
   use crate::quantize::{ac_q, select_ac_qi};
@@ -112,8 +112,7 @@ fn segmentation_optimize_inner<T: Pixel>(
   // See `distortion_scale_for` for more information.
   let compute_delta = |centroids: &[i16]| {
     use crate::util::{bexp64, blog64};
-    let log2_base_ac_q_q57 =
-      blog64(ac_q(fi.base_q_idx, 0, fi.config.bit_depth).get().into());
+    let log2_base_ac_q_q57 = blog64(ac_q::<BD>(fi.base_q_idx, 0).get().into());
     centroids
       .iter()
       .rev()
@@ -128,8 +127,7 @@ fn segmentation_optimize_inner<T: Pixel>(
       // and take the delta from the base quantizer index.
       .map(|q| {
         // Avoid going into lossless mode by never bringing qidx below 1.
-        select_ac_qi(q, fi.config.bit_depth).max(1) as i16
-          - fi.base_q_idx as i16
+        select_ac_qi::<BD>(q).max(1) as i16 - fi.base_q_idx as i16
       })
       .collect::<ArrayVec<_, MAX_SEGMENTS>>()
   };
@@ -155,7 +153,7 @@ fn segmentation_optimize_inner<T: Pixel>(
     data[SegLvl::SEG_LVL_ALT_Q as usize] = delta.max(offset_lower_limit);
   }
 
-  fs.segmentation.update_threshold(fi.base_q_idx, fi.config.bit_depth);
+  fs.segmentation.update_threshold::<BD>(fi.base_q_idx);
 }
 
 pub fn select_segment<T: Pixel>(
diff --git a/src/transform/forward.rs b/src/transform/forward.rs
index ac9f4e850b..d50ad8435a 100644
--- a/src/transform/forward.rs
+++ b/src/transform/forward.rs
@@ -98,9 +98,9 @@ pub mod rust {
   ///
   /// - If called with an invalid combination of `tx_size` and `tx_type`
   #[cold_for_target_arch("x86_64")]
-  pub fn forward_transform<T: Coefficient>(
+  pub fn forward_transform<T: Coefficient, const BD: usize>(
     input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-    tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
+    tx_type: TxType, _cpu: CpuFeatureLevel,
   ) {
     assert!(valid_av1_transform(tx_size, tx_type));
 
@@ -117,7 +117,7 @@ pub mod rust {
     let mut tmp: Aligned<[i32; 64 * 64]> = unsafe { Aligned::uninitialized() };
     let buf = &mut tmp.data[..txfm_size_col * txfm_size_row];
 
-    let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd);
+    let cfg = Txfm2DFlipCfg::fwd::<BD>(tx_type, tx_size);
 
     let txfm_func_col = get_func(cfg.txfm_type_col);
     let txfm_func_row = get_func(cfg.txfm_type_row);
diff --git a/src/transform/forward_shared.rs b/src/transform/forward_shared.rs
index 2c818fb89e..221b99f4d3 100644
--- a/src/transform/forward_shared.rs
+++ b/src/transform/forward_shared.rs
@@ -119,7 +119,7 @@ impl Txfm2DFlipCfg {
   /// # Panics
   ///
   /// - If called with an invalid combination of `tx_size` and `tx_type`
-  pub fn fwd(tx_type: TxType, tx_size: TxSize, bd: usize) -> Self {
+  pub fn fwd<const BD: usize>(tx_type: TxType, tx_size: TxSize) -> Self {
     let tx_type_1d_col = VTX_TAB[tx_type as usize];
     let tx_type_1d_row = HTX_TAB[tx_type as usize];
     let txw_idx = tx_size.width_index();
@@ -134,7 +134,7 @@ impl Txfm2DFlipCfg {
       tx_size,
       ud_flip,
       lr_flip,
-      shift: FWD_TXFM_SHIFT_LS[tx_size as usize][(bd - 8) / 2],
+      shift: FWD_TXFM_SHIFT_LS[tx_size as usize][(BD - 8) / 2],
       txfm_type_col,
       txfm_type_row,
     }
diff --git a/src/transform/inverse.rs b/src/transform/inverse.rs
index cfe136352b..54c0a2d575 100644
--- a/src/transform/inverse.rs
+++ b/src/transform/inverse.rs
@@ -1602,9 +1602,9 @@ pub(crate) mod rust {
   use std::cmp;
 
   #[cold_for_target_arch("x86_64", "aarch64")]
-  pub fn inverse_transform_add<T: Pixel>(
+  pub fn inverse_transform_add<T: Pixel, const BD: usize>(
     input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize,
-    tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
+    tx_size: TxSize, tx_type: TxType, _cpu: CpuFeatureLevel,
   ) {
     let width: usize = tx_size.width();
     let height: usize = tx_size.height();
@@ -1619,7 +1619,7 @@ pub(crate) mod rust {
     let tx_types_1d = get_1d_tx_types(tx_type);
 
     // perform inv txfm on every row
-    let range = bd + 8;
+    let range = BD + 8;
     let txfm_fn = INV_TXFM_FNS[tx_types_1d.1 as usize][ILog::ilog(width) - 3];
     // 64 point transforms only signal 32 coeffs. We only take chunks of 32
     //   and skip over the last 32 transforms here.
@@ -1645,7 +1645,7 @@ pub(crate) mod rust {
     }
 
     // perform inv txfm on every col
-    let range = cmp::max(bd + 6, 16);
+    let range = cmp::max(BD + 6, 16);
     let txfm_fn = INV_TXFM_FNS[tx_types_1d.0 as usize][ILog::ilog(height) - 3];
     for c in 0..width {
       let mut temp_in: [i32; 64] = [0; 64];
@@ -1664,7 +1664,7 @@ pub(crate) mod rust {
         .zip(output.rows_iter_mut().map(|row| &mut row[c]).take(height))
       {
         let v: i32 = (*out).as_();
-        let v = clamp(v + round_shift(*temp, 4), 0, (1 << bd) - 1);
+        let v = clamp(v + round_shift(*temp, 4), 0, (1 << BD) - 1);
         *out = T::cast_from(v);
       }
     }
diff --git a/src/transform/mod.rs b/src/transform/mod.rs
index d14913e133..09c4824061 100644
--- a/src/transform/mod.rs
+++ b/src/transform/mod.rs
@@ -474,14 +474,20 @@ mod test {
       *d = T::cast_from(random::<u8>());
       *r = i16::cast_from(*s) - i16::cast_from(*d);
     }
-    forward_transform(res, freq, tx_size.width(), tx_size, tx_type, 8, cpu);
+    forward_transform::<_, 8>(
+      res,
+      freq,
+      tx_size.width(),
+      tx_size,
+      tx_type,
+      cpu,
+    );
     inverse_transform_add(
       freq,
       &mut dst.as_region_mut(),
       coeff_area,
       tx_size,
       tx_type,
-      8,
       cpu,
     );
 

From ee64e5555b22d45ed4ee80f61ee8da3130f74295 Mon Sep 17 00:00:00 2001
From: Josh Holmer <jholmer.in@gmail.com>
Date: Wed, 1 Feb 2023 03:22:19 -0500
Subject: [PATCH 2/2] Fix tests

---
 src/activity.rs               |   9 ++-
 src/api/test.rs               |   8 +--
 src/asm/shared/predict.rs     |   7 +--
 src/asm/x86/cdef.rs           |   4 +-
 src/asm/x86/dist/cdef_dist.rs |   8 +--
 src/asm/x86/dist/mod.rs       | 111 ++++++++++++++++++++++------------
 src/dist.rs                   |   1 -
 src/predict.rs                |   2 +-
 src/transform/mod.rs          |  14 ++---
 9 files changed, 101 insertions(+), 63 deletions(-)

diff --git a/src/activity.rs b/src/activity.rs
index 79a634540f..facca82125 100644
--- a/src/activity.rs
+++ b/src/activity.rs
@@ -258,8 +258,13 @@ mod ssim_boost_tests {
     let scale = ((1 << bd) - 1) << (6 - 2 + bd - 8);
     for svar in scale..(scale << 2) {
       let float = ((scale << 1) as f64 / svar as f64).cbrt();
-      let fixed =
-        apply_ssim_boost(1 << 23, svar, svar) as f64 / (1 << 23) as f64;
+      let fixed = match bd {
+        8 => apply_ssim_boost::<8>(1 << 23, svar, svar),
+        10 => apply_ssim_boost::<10>(1 << 23, svar, svar),
+        12 => apply_ssim_boost::<12>(1 << 23, svar, svar),
+        _ => unimplemented!(),
+      } as f64
+        / (1 << 23) as f64;
 
       // Compare the two versions
       max_relative_error =
diff --git a/src/api/test.rs b/src/api/test.rs
index 0a698ba4d5..12618388fc 100644
--- a/src/api/test.rs
+++ b/src/api/test.rs
@@ -2274,7 +2274,7 @@ fn min_quantizer_bounds_correctly() {
   ctx.flush();
 
   for i in 0..limit {
-    ctx.inner.encode_packet(i).unwrap();
+    ctx.inner.encode_packet::<8>(i).unwrap();
     let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap();
     if i == 0 {
       assert_eq!(68, frame_data.fi.base_q_idx);
@@ -2305,7 +2305,7 @@ fn min_quantizer_bounds_correctly() {
   ctx.flush();
 
   for i in 0..limit {
-    ctx.inner.encode_packet(i).unwrap();
+    ctx.inner.encode_packet::<8>(i).unwrap();
     let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap();
     if i == 0 {
       assert!(frame_data.fi.base_q_idx > 68);
@@ -2339,7 +2339,7 @@ fn max_quantizer_bounds_correctly() {
   ctx.flush();
 
   for i in 0..limit {
-    ctx.inner.encode_packet(i).unwrap();
+    ctx.inner.encode_packet::<8>(i).unwrap();
     let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap();
     if i == 0 {
       assert_eq!(95, frame_data.fi.base_q_idx);
@@ -2370,7 +2370,7 @@ fn max_quantizer_bounds_correctly() {
   ctx.flush();
 
   for i in 0..limit {
-    ctx.inner.encode_packet(i).unwrap();
+    ctx.inner.encode_packet::<8>(i).unwrap();
     let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap();
     if i == 0 {
       assert!(frame_data.fi.base_q_idx < 95);
diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs
index 3ef711aca9..dabf3256c3 100644
--- a/src/asm/shared/predict.rs
+++ b/src/asm/shared/predict.rs
@@ -21,7 +21,6 @@ mod test {
   #[test]
   fn pred_matches_u8() {
     let tx_size = TxSize::TX_4X4;
-    let bit_depth = 8;
     let cpu = CpuFeatureLevel::default();
     let ac = [0i16; 32 * 32];
     // SAFETY: We write to the array below before reading from it.
@@ -73,12 +72,11 @@ mod test {
       for angle in angles {
         let expected = {
           let mut plane = Plane::from_slice(&[0u8; 4 * 4], 4);
-          rust::dispatch_predict_intra(
+          rust::dispatch_predict_intra::<_, 8>(
             *mode,
             *variant,
             &mut plane.as_region_mut(),
             tx_size,
-            bit_depth,
             &ac,
             *angle,
             None,
@@ -93,12 +91,11 @@ mod test {
         };
 
         let mut output = Plane::from_slice(&[0u8; 4 * 4], 4);
-        dispatch_predict_intra(
+        dispatch_predict_intra::<_, 8>(
           *mode,
           *variant,
           &mut output.as_region_mut(),
           tx_size,
-          bit_depth,
           &ac,
           *angle,
           None,
diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs
index 1ab8be9099..50e8801a78 100644
--- a/src/asm/x86/cdef.rs
+++ b/src/asm/x86/cdef.rs
@@ -322,8 +322,8 @@ mod test {
             // FIXME: Remove `allow` once https://github.com/rust-lang/rust-clippy/issues/8264 fixed
             #[allow(clippy::undocumented_unsafe_blocks)]
             unsafe {
-              cdef_filter_block::<_, _, 8>(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping,  $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              cdef_filter_block::<_, _, 8>(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping,  $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST);
+              cdef_filter_block::<_, 8>(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping,  $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              cdef_filter_block::<_, 8>(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping,  $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST);
               assert_eq!(rust_dst.data_origin(), dst.data_origin());
             }
           }
diff --git a/src/asm/x86/dist/cdef_dist.rs b/src/asm/x86/dist/cdef_dist.rs
index 4b2ab541b3..e8119d9d88 100644
--- a/src/asm/x86/dist/cdef_dist.rs
+++ b/src/asm/x86/dist/cdef_dist.rs
@@ -342,14 +342,14 @@ pub mod test {
 
   #[test]
   fn cdef_dist_simd_large_diff_hbd() {
-    cdef_diff_tester::<_, 10>(max_diff_planes::<u16, 10>);
+    cdef_diff_tester::<_, 10>(max_diff_planes::<u16>);
     cdef_diff_tester::<_, 12>(max_diff_planes::<u16>);
   }
 
   fn cdef_diff_tester<T: Pixel, const BD: usize>(
     gen_planes: fn(bd: usize) -> (Plane<T>, Plane<T>),
   ) {
-    let (src_plane, dst_plane) = gen_planes(bd);
+    let (src_plane, dst_plane) = gen_planes(BD);
 
     let mut fail = false;
 
@@ -361,7 +361,7 @@ pub mod test {
         let src_region = src_plane.region(area);
         let dst_region = dst_plane.region(area);
 
-        let rust = rust::cdef_dist_kernel(
+        let rust = rust::cdef_dist_kernel::<_, BD>(
           &src_region,
           &dst_region,
           w,
@@ -369,7 +369,7 @@ pub mod test {
           CpuFeatureLevel::default(),
         );
 
-        let simd = cdef_dist_kernel(
+        let simd = cdef_dist_kernel::<_, BD>(
           &src_region,
           &dst_region,
           w,
diff --git a/src/asm/x86/dist/mod.rs b/src/asm/x86/dist/mod.rs
index 9dc8207ff6..08f3381ab0 100644
--- a/src/asm/x86/dist/mod.rs
+++ b/src/asm/x86/dist/mod.rs
@@ -735,12 +735,12 @@ mod test {
   use rand::random;
   use std::str::FromStr;
 
-  macro_rules! test_dist_fns {
-    ($(($W:expr, $H:expr)),*, $DIST_TY:ident, $BD:expr, $OPT:ident, $OPTLIT:tt) => {
+  macro_rules! test_dist_sad_fns {
+    ($(($W:expr, $H:expr)),*, $OPT:ident, $OPTLIT:tt, $BD:expr) => {
       $(
         paste::item! {
           #[test]
-          fn [<get_ $DIST_TY _ $W x $H _bd_ $BD _ $OPT>]() {
+          fn [<get_sad _ $W x $H _bd_ $BD _ $OPT>]() {
             if !is_x86_feature_detected!($OPTLIT) {
               eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT);
               return;
@@ -755,8 +755,8 @@ mod test {
                 *s = random::<u8>() as u16 * $BD / 8;
                 *d = random::<u8>() as u16 * $BD / 8;
               }
-              let result = [<get_ $DIST_TY>]::<$BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              let rust_result = [<get_ $DIST_TY>]::<$BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
+              let result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
 
               assert_eq!(rust_result, result);
             } else {
@@ -768,8 +768,8 @@ mod test {
                 *s = random::<u8>();
                 *d = random::<u8>();
               }
-              let result = [<get_ $DIST_TY>]::<$BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              let rust_result = [<get_ $DIST_TY>]::<$BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
+              let result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
 
               assert_eq!(rust_result, result);
             }
@@ -779,7 +779,51 @@ mod test {
     }
   }
 
-  test_dist_fns!(
+  macro_rules! test_dist_satd_fns {
+    ($(($W:expr, $H:expr)),*, $OPT:ident, $OPTLIT:tt, $BD:expr) => {
+      $(
+        paste::item! {
+          #[test]
+          fn [<get_satd_ $W x $H _bd_ $BD _ $OPT>]() {
+            if !is_x86_feature_detected!($OPTLIT) {
+              eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT);
+              return;
+            }
+
+            if $BD > 8 {
+              // dynamic allocation: test
+              let mut src = Plane::from_slice(&[0u16; $W * $H], $W);
+              // dynamic allocation: test
+              let mut dst = Plane::from_slice(&[0u16; $W * $H], $W);
+              for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) {
+                *s = random::<u8>() as u16 * $BD / 8;
+                *d = random::<u8>() as u16 * $BD / 8;
+              }
+              let result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
+
+              assert_eq!(rust_result, result);
+            } else {
+              // dynamic allocation: test
+              let mut src = Plane::from_slice(&[0u8; $W * $H], $W);
+              // dynamic allocation: test
+              let mut dst = Plane::from_slice(&[0u8; $W * $H], $W);
+              for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) {
+                *s = random::<u8>();
+                *d = random::<u8>();
+              }
+              let result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
+
+              assert_eq!(rust_result, result);
+            }
+          }
+        }
+      )*
+    }
+  }
+
+  test_dist_sad_fns!(
     (4, 4),
     (16, 16),
     (8, 8),
@@ -802,13 +846,12 @@ mod test {
     (128, 64),
     (16, 64),
     (64, 16),
-    sad,
-    10,
     ssse3,
-    "ssse3"
+    "ssse3",
+    10
   );
 
-  test_dist_fns!(
+  test_dist_sad_fns!(
     (4, 4),
     (16, 16),
     (8, 8),
@@ -831,13 +874,12 @@ mod test {
     (128, 64),
     (16, 64),
     (64, 16),
-    sad,
-    10,
     avx2,
-    "avx2"
+    "avx2",
+    10
   );
 
-  test_dist_fns!(
+  test_dist_sad_fns!(
     (4, 4),
     (4, 8),
     (4, 16),
@@ -849,13 +891,12 @@ mod test {
     (32, 32),
     (64, 64),
     (128, 128),
-    sad,
-    8,
     sse2,
-    "sse2"
+    "sse2",
+    8
   );
 
-  test_dist_fns!(
+  test_dist_sad_fns!(
     (16, 4),
     (16, 8),
     (16, 16),
@@ -871,17 +912,16 @@ mod test {
     (64, 128),
     (128, 64),
     (128, 128),
-    sad,
-    8,
     avx2,
-    "avx2"
+    "avx2",
+    8
   );
 
-  test_dist_fns!((8, 8), satd, 8, ssse3, "ssse3");
+  test_dist_satd_fns!((8, 8), ssse3, "ssse3", 8);
 
-  test_dist_fns!((4, 4), satd, 8, sse4, "sse4.1");
+  test_dist_satd_fns!((4, 4), sse4, "sse4.1", 8);
 
-  test_dist_fns!(
+  test_dist_satd_fns!(
     (4, 4),
     (8, 8),
     (16, 16),
@@ -904,13 +944,12 @@ mod test {
     (32, 8),
     (16, 64),
     (64, 16),
-    satd,
-    8,
     avx2,
-    "avx2"
+    "avx2",
+    8
   );
 
-  test_dist_fns!(
+  test_dist_satd_fns!(
     (4, 4),
     (8, 8),
     (16, 16),
@@ -933,13 +972,12 @@ mod test {
     (32, 8),
     (16, 64),
     (64, 16),
-    satd,
-    10,
     avx2,
-    "avx2"
+    "avx2",
+    10
   );
 
-  test_dist_fns!(
+  test_dist_satd_fns!(
     (4, 4),
     (8, 8),
     (16, 16),
@@ -962,9 +1000,8 @@ mod test {
     (32, 8),
     (16, 64),
     (64, 16),
-    satd,
-    12,
     avx2,
-    "avx2"
+    "avx2",
+    12
   );
 }
diff --git a/src/dist.rs b/src/dist.rs
index 453dfa99a1..aaf2e3e289 100644
--- a/src/dist.rs
+++ b/src/dist.rs
@@ -440,7 +440,6 @@ pub mod test {
       (64, 16, 116384),
     ];
 
-    let bit_depth: usize = 8;
     let (input_plane, rec_plane) = setup_planes::<T>();
 
     for (w, h, distortion) in blocks {
diff --git a/src/predict.rs b/src/predict.rs
index 1de1b7f8bc..0c3098233d 100644
--- a/src/predict.rs
+++ b/src/predict.rs
@@ -1508,7 +1508,7 @@ mod test {
     pred_dc_left(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [30u8; 16]);
 
-    pred_dc_128(&mut output.as_region_mut(), above, left, 4, 4);
+    pred_dc_128::<_, 8>(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [128u8; 16]);
 
     pred_v(&mut output.as_region_mut(), above, 4, 4);
diff --git a/src/transform/mod.rs b/src/transform/mod.rs
index 09c4824061..d05dc6dfe5 100644
--- a/src/transform/mod.rs
+++ b/src/transform/mod.rs
@@ -450,7 +450,7 @@ mod test {
   use crate::frame::*;
   use rand::random;
 
-  fn test_roundtrip<T: Pixel>(
+  fn test_roundtrip<T: Pixel, const BD: usize>(
     tx_size: TxSize, tx_type: TxType, tolerance: i16,
   ) {
     let cpu = CpuFeatureLevel::default();
@@ -474,7 +474,7 @@ mod test {
       *d = T::cast_from(random::<u8>());
       *r = i16::cast_from(*s) - i16::cast_from(*d);
     }
-    forward_transform::<_, 8>(
+    forward_transform::<_, BD>(
       res,
       freq,
       tx_size.width(),
@@ -482,7 +482,7 @@ mod test {
       tx_type,
       cpu,
     );
-    inverse_transform_add(
+    inverse_transform_add::<_, BD>(
       freq,
       &mut dst.as_region_mut(),
       coeff_area,
@@ -532,7 +532,7 @@ mod test {
     }
   }
 
-  fn roundtrips<T: Pixel>() {
+  fn roundtrips<T: Pixel, const BD: usize>() {
     let combinations = [
       (TX_4X4, DCT_DCT, 0),
       (TX_4X4, ADST_DCT, 0),
@@ -583,17 +583,17 @@ mod test {
     ];
     for &(tx_size, tx_type, tolerance) in combinations.iter() {
       println!("Testing combination {:?}, {:?}", tx_size, tx_type);
-      test_roundtrip::<T>(tx_size, tx_type, tolerance);
+      test_roundtrip::<T, BD>(tx_size, tx_type, tolerance);
     }
   }
 
   #[test]
   fn roundtrips_u8() {
-    roundtrips::<u8>();
+    roundtrips::<u8, 8>();
   }
 
   #[test]
   fn roundtrips_u16() {
-    roundtrips::<u16>();
+    roundtrips::<u16, 10>();
   }
 }