diff --git a/src/activity.rs b/src/activity.rs
index 5a8400d978..facca82125 100644
--- a/src/activity.rs
+++ b/src/activity.rs
@@ -56,11 +56,11 @@ impl ActivityMask {
   }
 
   #[hawktracer(activity_mask_fill_scales)]
-  pub fn fill_scales(
-    &self, bit_depth: usize, activity_scales: &mut Box<[DistortionScale]>,
+  pub fn fill_scales<const BD: usize>(
+    &self, activity_scales: &mut [DistortionScale],
   ) {
     for (dst, &src) in activity_scales.iter_mut().zip(self.variances.iter()) {
-      *dst = ssim_boost(src, src, bit_depth);
+      *dst = ssim_boost::<BD>(src, src);
     }
   }
 }
@@ -146,21 +146,20 @@ fn ssim_boost_rsqrt(x: u64) -> RsqrtOutput {
 }
 
 #[inline(always)]
-pub fn ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> DistortionScale {
-  DistortionScale(apply_ssim_boost(
+pub fn ssim_boost<const BD: usize>(svar: u32, dvar: u32) -> DistortionScale {
+  DistortionScale(apply_ssim_boost::<BD>(
     DistortionScale::default().0,
     svar,
     dvar,
-    bit_depth,
   ))
 }
 
 /// Apply ssim boost to a given input
 #[inline(always)]
-pub fn apply_ssim_boost(
-  input: u32, svar: u32, dvar: u32, bit_depth: usize,
+pub fn apply_ssim_boost<const BD: usize>(
+  input: u32, svar: u32, dvar: u32,
 ) -> u32 {
-  let coeff_shift = bit_depth - 8;
+  let coeff_shift = BD - 8;
 
   // Scale dvar and svar to lbd range to prevent overflows.
   let svar = (svar >> (2 * coeff_shift)) as u64;
@@ -199,7 +198,7 @@ mod ssim_boost_tests {
     let max_pix_diff = (1 << 12) - 1;
     let max_pix_sse = max_pix_diff * max_pix_diff;
     let max_variance = max_pix_diff * 8 * 8 / 4;
-    apply_ssim_boost(max_pix_sse * 8 * 8, max_variance, max_variance, 12);
+    apply_ssim_boost::<12>(max_pix_sse * 8 * 8, max_variance, max_variance);
   }
 
   /// Floating point reference version of `ssim_boost`
@@ -234,8 +233,8 @@ mod ssim_boost_tests {
         let dvar = rng.gen_range(0..(1 << scale));
 
         let float = reference_ssim_boost(svar, dvar, 12);
-        let fixed =
-          apply_ssim_boost(1 << 23, svar, dvar, 12) as f64 / (1 << 23) as f64;
+        let fixed = apply_ssim_boost::<12>(1 << 23, svar, dvar) as f64
+          / (1 << 23) as f64;
 
         // Compare the two versions
         max_relative_error =
@@ -259,8 +258,13 @@ mod ssim_boost_tests {
     let scale = ((1 << bd) - 1) << (6 - 2 + bd - 8);
     for svar in scale..(scale << 2) {
       let float = ((scale << 1) as f64 / svar as f64).cbrt();
-      let fixed =
-        apply_ssim_boost(1 << 23, svar, svar, bd) as f64 / (1 << 23) as f64;
+      let fixed = match bd {
+        8 => apply_ssim_boost::<8>(1 << 23, svar, svar),
+        10 => apply_ssim_boost::<10>(1 << 23, svar, svar),
+        12 => apply_ssim_boost::<12>(1 << 23, svar, svar),
+        _ => unimplemented!(),
+      } as f64
+        / (1 << 23) as f64;
 
       // Compare the two versions
       max_relative_error =
diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs
index fbb5ad3e5b..42f3b211a3 100644
--- a/src/api/config/mod.rs
+++ b/src/api/config/mod.rs
@@ -248,8 +248,15 @@ impl Config {
     // First-pass parameters depend on whether second-pass is in effect.
     // So `init_first_pass` must follow `init_second_pass`.
     if self.rate_control.emit_pass_data {
-      let maybe_pass1_log_base_q = (self.rate_control.summary.is_none())
-        .then(|| inner.rc_state.select_pass1_log_base_q(&inner, 0));
+      let maybe_pass1_log_base_q =
+        (self.rate_control.summary.is_none()).then(|| {
+          match self.enc.bit_depth {
+            8 => inner.rc_state.select_pass1_log_base_q::<_, 8>(&inner, 0),
+            10 => inner.rc_state.select_pass1_log_base_q::<_, 10>(&inner, 0),
+            12 => inner.rc_state.select_pass1_log_base_q::<_, 12>(&inner, 0),
+            _ => unimplemented!(),
+          }
+        });
       inner.rc_state.init_first_pass(maybe_pass1_log_base_q);
     }
 
diff --git a/src/api/context.rs b/src/api/context.rs
index 58d697fd61..9366c53fbb 100644
--- a/src/api/context.rs
+++ b/src/api/context.rs
@@ -129,7 +129,12 @@ impl<T: Pixel> Context<T> {
     }
 
     let inner = &mut self.inner;
-    let run = move || inner.send_frame(frame, params);
+    let run = move || match inner.config.bit_depth {
+      8 => inner.send_frame::<8>(frame, params),
+      10 => inner.send_frame::<10>(frame, params),
+      12 => inner.send_frame::<12>(frame, params),
+      _ => unimplemented!(),
+    };
 
     match &self.pool {
       Some(pool) => pool.install(run),
@@ -302,7 +307,12 @@ impl<T: Pixel> Context<T> {
   #[inline]
   pub fn receive_packet(&mut self) -> Result<Packet<T>, EncoderStatus> {
     let inner = &mut self.inner;
-    let mut run = move || inner.receive_packet();
+    let mut run = move || match inner.config.bit_depth {
+      8 => inner.receive_packet::<8>(),
+      10 => inner.receive_packet::<10>(),
+      12 => inner.receive_packet::<12>(),
+      _ => unimplemented!(),
+    };
 
     match &self.pool {
       Some(pool) => pool.install(run),
diff --git a/src/api/internal.rs b/src/api/internal.rs
index 1a978de836..5c379d5a86 100644
--- a/src/api/internal.rs
+++ b/src/api/internal.rs
@@ -317,7 +317,7 @@ impl<T: Pixel> ContextInner<T> {
   }
 
   #[hawktracer(send_frame)]
-  pub fn send_frame(
+  pub fn send_frame<const BD: usize>(
     &mut self, mut frame: Option<Arc<Frame<T>>>,
     params: Option<FrameParameters>,
   ) -> Result<(), EncoderStatus> {
@@ -376,7 +376,7 @@ impl<T: Pixel> ContextInner<T> {
             break;
           }
 
-          Self::compute_keyframe_placement(
+          Self::compute_keyframe_placement::<BD>(
             cur_lookahead_frames,
             &self.keyframes_forced,
             &mut self.keyframe_detector,
@@ -385,7 +385,7 @@ impl<T: Pixel> ContextInner<T> {
           );
         }
       } else {
-        Self::compute_keyframe_placement(
+        Self::compute_keyframe_placement::<BD>(
           &lookahead_frames,
           &self.keyframes_forced,
           &mut self.keyframe_detector,
@@ -395,7 +395,7 @@ impl<T: Pixel> ContextInner<T> {
       }
     }
 
-    self.compute_frame_invariants();
+    self.compute_frame_invariants::<BD>();
 
     Ok(())
   }
@@ -649,7 +649,9 @@ impl<T: Pixel> ContextInner<T> {
   /// function must be called after every new `FrameInvariants` is initially
   /// computed.
   #[hawktracer(compute_lookahead_motion_vectors)]
-  fn compute_lookahead_motion_vectors(&mut self, output_frameno: u64) {
+  fn compute_lookahead_motion_vectors<const BD: usize>(
+    &mut self, output_frameno: u64,
+  ) {
     let frame_data = self.frame_data.get(&output_frameno).unwrap();
 
     // We're only interested in valid frames which are not show-existing-frame.
@@ -665,7 +667,7 @@ impl<T: Pixel> ContextInner<T> {
 
     let qps = {
       let fti = frame_data.as_ref().unwrap().fi.get_frame_subtype();
-      self.rc_state.select_qi(
+      self.rc_state.select_qi::<_, BD>(
         self,
         output_frameno,
         fti,
@@ -742,14 +744,14 @@ impl<T: Pixel> ContextInner<T> {
     fi.rec_buffer = coded_data.lookahead_rec_buffer.clone();
 
     // Estimate lambda with rate-control dry-run
-    fi.set_quantizers(&qps);
+    fi.set_quantizers::<BD>(&qps);
 
     // TODO: as in the encoding code, key frames will have no references.
     // However, for block importance purposes we want key frames to act as
     // P-frames in this instance.
     //
     // Compute the motion vectors.
-    compute_motion_vectors(fi, fs, &self.inter_cfg);
+    compute_motion_vectors::<_, BD>(fi, fs, &self.inter_cfg);
 
     let coded_data = fi.coded_frame_data.as_mut().unwrap();
 
@@ -818,7 +820,9 @@ impl<T: Pixel> ContextInner<T> {
   /// Computes lookahead intra cost approximations and fills in
   /// `lookahead_intra_costs` on the `FrameInvariants`.
   #[hawktracer(compute_lookahead_intra_costs)]
-  fn compute_lookahead_intra_costs(&mut self, output_frameno: u64) {
+  fn compute_lookahead_intra_costs<const BD: usize>(
+    &mut self, output_frameno: u64,
+  ) {
     let frame_data = self.frame_data.get(&output_frameno).unwrap();
     let fd = &frame_data.as_ref();
 
@@ -853,23 +857,22 @@ impl<T: Pixel> ContextInner<T> {
 
         // We use the cached values from scenechange if available,
         // otherwise we need to calculate them here.
-        estimate_intra_costs(
+        estimate_intra_costs::<_, BD>(
           temp_plane,
           &**frame,
-          fi.sequence.bit_depth,
           fi.cpu_feature_level,
         )
       });
   }
 
   #[hawktracer(compute_keyframe_placement)]
-  pub fn compute_keyframe_placement(
+  pub fn compute_keyframe_placement<const BD: usize>(
     lookahead_frames: &[&Arc<Frame<T>>], keyframes_forced: &BTreeSet<u64>,
     keyframe_detector: &mut SceneChangeDetector<T>,
     next_lookahead_frame: &mut u64, keyframes: &mut BTreeSet<u64>,
   ) {
     if keyframes_forced.contains(next_lookahead_frame)
-      || keyframe_detector.analyze_next_frame(
+      || keyframe_detector.analyze_next_frame::<BD>(
         lookahead_frames,
         *next_lookahead_frame,
         *keyframes.iter().last().unwrap(),
@@ -882,24 +885,26 @@ impl<T: Pixel> ContextInner<T> {
   }
 
   #[hawktracer(compute_frame_invariants)]
-  pub fn compute_frame_invariants(&mut self) {
+  pub fn compute_frame_invariants<const BD: usize>(&mut self) {
     while self.set_frame_properties(self.next_lookahead_output_frameno).is_ok()
     {
-      self
-        .compute_lookahead_motion_vectors(self.next_lookahead_output_frameno);
+      self.compute_lookahead_motion_vectors::<BD>(
+        self.next_lookahead_output_frameno,
+      );
       if self.config.temporal_rdo() {
-        self.compute_lookahead_intra_costs(self.next_lookahead_output_frameno);
+        self.compute_lookahead_intra_costs::<BD>(
+          self.next_lookahead_output_frameno,
+        );
       }
       self.next_lookahead_output_frameno += 1;
     }
   }
 
   #[hawktracer(update_block_importances)]
-  fn update_block_importances(
+  fn update_block_importances<const BD: usize>(
     fi: &FrameInvariants<T>, me_stats: &crate::me::FrameMEStats,
-    frame: &Frame<T>, reference_frame: &Frame<T>, bit_depth: usize,
-    bsize: BlockSize, len: usize,
-    reference_frame_block_importances: &mut [f32],
+    frame: &Frame<T>, reference_frame: &Frame<T>, bsize: BlockSize,
+    len: usize, reference_frame_block_importances: &mut [f32],
   ) {
     let coded_data = fi.coded_frame_data.as_ref().unwrap();
     let plane_org = &frame.planes[0];
@@ -946,12 +951,11 @@ impl<T: Pixel> ContextInner<T> {
                 height: IMPORTANCE_BLOCK_SIZE,
               });
 
-              let inter_cost = get_satd(
+              let inter_cost = get_satd::<_, BD>(
                 &region_org,
                 &region_ref,
                 bsize.width(),
                 bsize.height(),
-                bit_depth,
                 fi.cpu_feature_level,
               ) as f32;
 
@@ -1058,7 +1062,7 @@ impl<T: Pixel> ContextInner<T> {
 
   /// Computes the block importances for the current output frame.
   #[hawktracer(compute_block_importances)]
-  fn compute_block_importances(&mut self) {
+  fn compute_block_importances<const BD: usize>(&mut self) {
     // SEF don't need block importances.
     if self.frame_data[&self.output_frameno]
       .as_ref()
@@ -1142,7 +1146,6 @@ impl<T: Pixel> ContextInner<T> {
           }
         }
 
-        let bit_depth = self.config.bit_depth;
         let frame_data = &mut self.frame_data;
         let len = unique_indices.len();
 
@@ -1178,12 +1181,11 @@ impl<T: Pixel> ContextInner<T> {
                 .block_importances
             })
           {
-            Self::update_block_importances(
+            Self::update_block_importances::<BD>(
               fi,
               me_stats,
               frame,
               reference_frame,
-              bit_depth,
               bsize,
               len,
               reference_frame_block_importances,
@@ -1244,7 +1246,7 @@ impl<T: Pixel> ContextInner<T> {
     }
   }
 
-  pub(crate) fn encode_packet(
+  pub(crate) fn encode_packet<const BD: usize>(
     &mut self, cur_output_frameno: u64,
   ) -> Result<Packet<T>, EncoderStatus> {
     if self
@@ -1325,10 +1327,9 @@ impl<T: Pixel> ContextInner<T> {
             self.frame_q[&frame_data.fi.input_frameno].as_ref().unwrap();
           coded_data.activity_mask =
             ActivityMask::from_plane(&frame.planes[0]);
-          coded_data.activity_mask.fill_scales(
-            frame_data.fi.sequence.bit_depth,
-            &mut coded_data.activity_scales,
-          );
+          coded_data
+            .activity_mask
+            .fill_scales::<BD>(&mut coded_data.activity_scales);
           log_isqrt_mean_scale = coded_data.compute_spatiotemporal_scores();
         } else {
           coded_data.activity_mask = ActivityMask::default();
@@ -1359,19 +1360,22 @@ impl<T: Pixel> ContextInner<T> {
       }
 
       let fti = frame_data.fi.get_frame_subtype();
-      let qps = self.rc_state.select_qi(
+      let qps = self.rc_state.select_qi::<_, BD>(
         self,
         cur_output_frameno,
         fti,
         self.maybe_prev_log_base_q,
         log_isqrt_mean_scale,
       );
-      frame_data.fi.set_quantizers(&qps);
+      frame_data.fi.set_quantizers::<BD>(&qps);
 
       if self.rc_state.needs_trial_encode(fti) {
         let mut trial_fs = frame_data.fs.clone();
-        let data =
-          encode_frame(&frame_data.fi, &mut trial_fs, &self.inter_cfg);
+        let data = encode_frame::<_, BD>(
+          &frame_data.fi,
+          &mut trial_fs,
+          &self.inter_cfg,
+        );
         self.rc_state.update_state(
           (data.len() * 8) as i64,
           fti,
@@ -1380,18 +1384,21 @@ impl<T: Pixel> ContextInner<T> {
           true,
           false,
         );
-        let qps = self.rc_state.select_qi(
+        let qps = self.rc_state.select_qi::<_, BD>(
           self,
           cur_output_frameno,
           fti,
           self.maybe_prev_log_base_q,
           log_isqrt_mean_scale,
         );
-        frame_data.fi.set_quantizers(&qps);
+        frame_data.fi.set_quantizers::<BD>(&qps);
       }
 
-      let data =
-        encode_frame(&frame_data.fi, &mut frame_data.fs, &self.inter_cfg);
+      let data = encode_frame::<_, BD>(
+        &frame_data.fi,
+        &mut frame_data.fs,
+        &self.inter_cfg,
+      );
       #[cfg(feature = "dump_lookahead_data")]
       {
         let input_frameno = frame_data.fi.input_frameno;
@@ -1488,7 +1495,9 @@ impl<T: Pixel> ContextInner<T> {
   }
 
   #[hawktracer(receive_packet)]
-  pub fn receive_packet(&mut self) -> Result<Packet<T>, EncoderStatus> {
+  pub fn receive_packet<const BD: usize>(
+    &mut self,
+  ) -> Result<Packet<T>, EncoderStatus> {
     if self.done_processing() {
       return Err(EncoderStatus::LimitReached);
     }
@@ -1514,12 +1523,12 @@ impl<T: Pixel> ContextInner<T> {
 
     if self.config.temporal_rdo() {
       // Compute the block importances for the current output frame.
-      self.compute_block_importances();
+      self.compute_block_importances::<BD>();
     }
 
     let cur_output_frameno = self.output_frameno;
 
-    let mut ret = self.encode_packet(cur_output_frameno);
+    let mut ret = self.encode_packet::<BD>(cur_output_frameno);
 
     if let Ok(ref mut pkt) = ret {
       self.garbage_collect(pkt.input_frameno);
diff --git a/src/api/lookahead.rs b/src/api/lookahead.rs
index 2758d5920b..81a495d7d7 100644
--- a/src/api/lookahead.rs
+++ b/src/api/lookahead.rs
@@ -27,8 +27,8 @@ pub(crate) const IMP_BLOCK_AREA_IN_MV_UNITS: i64 =
   IMP_BLOCK_SIZE_IN_MV_UNITS * IMP_BLOCK_SIZE_IN_MV_UNITS;
 
 #[hawktracer(estimate_intra_costs)]
-pub(crate) fn estimate_intra_costs<T: Pixel>(
-  temp_plane: &mut Plane<T>, frame: &Frame<T>, bit_depth: usize,
+pub(crate) fn estimate_intra_costs<T: Pixel, const BD: usize>(
+  temp_plane: &mut Plane<T>, frame: &Frame<T>,
   cpu_feature_level: CpuFeatureLevel,
 ) -> Box<[u32]> {
   let plane = &frame.planes[0];
@@ -54,7 +54,7 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
       });
 
       // TODO: other intra prediction modes.
-      let edge_buf = get_intra_edges(
+      let edge_buf = get_intra_edges::<_, BD>(
         &plane.as_region(),
         TileBlockOffset(BlockOffset { x, y }),
         0,
@@ -65,7 +65,6 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
           y: (y * IMPORTANCE_BLOCK_SIZE) as isize,
         },
         TxSize::TX_8X8,
-        bit_depth,
         Some(PredictionMode::DC_PRED),
         false,
         IntraParam::None,
@@ -79,7 +78,7 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
           height: IMPORTANCE_BLOCK_SIZE,
         });
 
-      PredictionMode::DC_PRED.predict_intra(
+      PredictionMode::DC_PRED.predict_intra::<_, BD>(
         TileRect {
           x: x * IMPORTANCE_BLOCK_SIZE,
           y: y * IMPORTANCE_BLOCK_SIZE,
@@ -88,7 +87,6 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
         },
         &mut plane_after_prediction_region,
         tx_size,
-        bit_depth,
         &[], // Not used by DC_PRED
         IntraParam::None,
         None, // Not used by DC_PRED
@@ -104,12 +102,11 @@ pub(crate) fn estimate_intra_costs<T: Pixel>(
           height: IMPORTANCE_BLOCK_SIZE,
         });
 
-      let intra_cost = get_satd(
+      let intra_cost = get_satd::<_, BD>(
         &plane_org,
         &plane_after_prediction_region,
         bsize.width(),
         bsize.height(),
-        bit_depth,
         cpu_feature_level,
       );
 
@@ -177,9 +174,9 @@ pub(crate) fn estimate_importance_block_difference<T: Pixel>(
 }
 
 #[hawktracer(estimate_inter_costs)]
-pub(crate) fn estimate_inter_costs<T: Pixel>(
-  frame: Arc<Frame<T>>, ref_frame: Arc<Frame<T>>, bit_depth: usize,
-  mut config: EncoderConfig, sequence: Arc<Sequence>, buffer: RefMEStats,
+pub(crate) fn estimate_inter_costs<T: Pixel, const BD: usize>(
+  frame: Arc<Frame<T>>, ref_frame: Arc<Frame<T>>, mut config: EncoderConfig,
+  sequence: Arc<Sequence>, buffer: RefMEStats,
 ) -> f64 {
   config.low_latency = true;
   config.speed_settings.multiref = false;
@@ -215,7 +212,7 @@ pub(crate) fn estimate_inter_costs<T: Pixel>(
       ],
     }),
   );
-  compute_motion_vectors(&mut fi, &mut fs, &inter_cfg);
+  compute_motion_vectors::<_, BD>(&mut fi, &mut fs, &inter_cfg);
 
   // Estimate inter costs
   let plane_org = &frame.planes[0];
@@ -252,12 +249,11 @@ pub(crate) fn estimate_inter_costs<T: Pixel>(
         height: IMPORTANCE_BLOCK_SIZE,
       });
 
-      inter_costs += get_satd(
+      inter_costs += get_satd::<_, BD>(
         &region_org,
         &region_ref,
         bsize.width(),
         bsize.height(),
-        bit_depth,
         fi.cpu_feature_level,
       ) as u64;
     });
@@ -266,7 +262,7 @@ pub(crate) fn estimate_inter_costs<T: Pixel>(
 }
 
 #[hawktracer(compute_motion_vectors)]
-pub(crate) fn compute_motion_vectors<T: Pixel>(
+pub(crate) fn compute_motion_vectors<T: Pixel, const BD: usize>(
   fi: &mut FrameInvariants<T>, fs: &mut FrameState<T>, inter_cfg: &InterConfig,
 ) {
   let mut blocks = FrameBlocks::new(fi.w_in_b, fi.h_in_b);
@@ -277,6 +273,6 @@ pub(crate) fn compute_motion_vectors<T: Pixel>(
     .into_par_iter()
     .for_each(|mut ctx| {
       let ts = &mut ctx.ts;
-      estimate_tile_motion(fi, ts, inter_cfg);
+      estimate_tile_motion::<_, BD>(fi, ts, inter_cfg);
     });
 }
diff --git a/src/api/test.rs b/src/api/test.rs
index 0a698ba4d5..12618388fc 100644
--- a/src/api/test.rs
+++ b/src/api/test.rs
@@ -2274,7 +2274,7 @@ fn min_quantizer_bounds_correctly() {
   ctx.flush();
 
   for i in 0..limit {
-    ctx.inner.encode_packet(i).unwrap();
+    ctx.inner.encode_packet::<8>(i).unwrap();
     let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap();
     if i == 0 {
       assert_eq!(68, frame_data.fi.base_q_idx);
@@ -2305,7 +2305,7 @@ fn min_quantizer_bounds_correctly() {
   ctx.flush();
 
   for i in 0..limit {
-    ctx.inner.encode_packet(i).unwrap();
+    ctx.inner.encode_packet::<8>(i).unwrap();
     let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap();
     if i == 0 {
       assert!(frame_data.fi.base_q_idx > 68);
@@ -2339,7 +2339,7 @@ fn max_quantizer_bounds_correctly() {
   ctx.flush();
 
   for i in 0..limit {
-    ctx.inner.encode_packet(i).unwrap();
+    ctx.inner.encode_packet::<8>(i).unwrap();
     let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap();
     if i == 0 {
       assert_eq!(95, frame_data.fi.base_q_idx);
@@ -2370,7 +2370,7 @@ fn max_quantizer_bounds_correctly() {
   ctx.flush();
 
   for i in 0..limit {
-    ctx.inner.encode_packet(i).unwrap();
+    ctx.inner.encode_packet::<8>(i).unwrap();
     let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap();
     if i == 0 {
       assert!(frame_data.fi.base_q_idx < 95);
diff --git a/src/asm/aarch64/cdef.rs b/src/asm/aarch64/cdef.rs
index 2fe70e1248..1c04e50f51 100644
--- a/src/asm/aarch64/cdef.rs
+++ b/src/asm/aarch64/cdef.rs
@@ -67,8 +67,8 @@ const fn decimate_index(xdec: usize, ydec: usize) -> usize {
 
 pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
   dst: &mut PlaneRegionMut<'_, T>, src: *const T, src_stride: isize,
-  pri_strength: i32, sec_strength: i32, dir: usize, damping: i32,
-  bit_depth: usize, xdec: usize, ydec: usize, edges: u8, cpu: CpuFeatureLevel,
+  pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, xdec: usize,
+  ydec: usize, edges: u8, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<T>| {
     rust::cdef_filter_block(
@@ -79,7 +79,6 @@ pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
       sec_strength,
       dir,
       damping,
-      bit_depth,
       xdec,
       ydec,
       edges,
diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs
index 3ef711aca9..dabf3256c3 100644
--- a/src/asm/shared/predict.rs
+++ b/src/asm/shared/predict.rs
@@ -21,7 +21,6 @@ mod test {
   #[test]
   fn pred_matches_u8() {
     let tx_size = TxSize::TX_4X4;
-    let bit_depth = 8;
     let cpu = CpuFeatureLevel::default();
     let ac = [0i16; 32 * 32];
     // SAFETY: We write to the array below before reading from it.
@@ -73,12 +72,11 @@ mod test {
       for angle in angles {
         let expected = {
           let mut plane = Plane::from_slice(&[0u8; 4 * 4], 4);
-          rust::dispatch_predict_intra(
+          rust::dispatch_predict_intra::<_, 8>(
             *mode,
             *variant,
             &mut plane.as_region_mut(),
             tx_size,
-            bit_depth,
             &ac,
             *angle,
             None,
@@ -93,12 +91,11 @@ mod test {
         };
 
         let mut output = Plane::from_slice(&[0u8; 4 * 4], 4);
-        dispatch_predict_intra(
+        dispatch_predict_intra::<_, 8>(
           *mode,
           *variant,
           &mut output.as_region_mut(),
           tx_size,
-          bit_depth,
           &ac,
           *angle,
           None,
diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs
index d34286bec7..94cb328702 100644
--- a/src/asm/shared/transform/inverse.rs
+++ b/src/asm/shared/transform/inverse.rs
@@ -17,11 +17,11 @@ pub type InvTxfmFunc =
 pub type InvTxfmHBDFunc =
   unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32);
 
-pub fn call_inverse_func<T: Pixel>(
+pub fn call_inverse_func<T: Pixel, const BD: usize>(
   func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>,
-  eob: usize, width: usize, height: usize, bd: usize,
+  eob: usize, width: usize, height: usize,
 ) {
-  debug_assert!(bd == 8);
+  debug_assert!(BD == 8);
 
   // Only use at most 32 columns and 32 rows of input coefficients.
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
@@ -51,7 +51,6 @@ pub fn call_inverse_func<T: Pixel>(
 pub fn call_inverse_hbd_func<T: Pixel>(
   func: InvTxfmHBDFunc, input: &[T::Coeff],
   output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize,
-  _bd: usize,
 ) {
   // Only use at most 32 columns and 32 rows of input coefficients.
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
@@ -161,35 +160,32 @@ pub mod test {
         *d = random::<u8>();
         *r = i16::from(*s) - i16::from(*d);
       }
-      forward_transform(
+      forward_transform::<_, 8>(
         res,
         freq,
         tx_size.width(),
         tx_size,
         tx_type,
-        8,
         CpuFeatureLevel::RUST,
       );
 
       let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h);
       let mut rust_dst = dst.clone();
 
-      inverse_transform_add(
+      inverse_transform_add::<_, 8>(
         freq,
         &mut dst.as_region_mut(),
         eob,
         tx_size,
         tx_type,
-        8,
         cpu,
       );
-      inverse_transform_add(
+      inverse_transform_add::<_, 8>(
         freq,
         &mut rust_dst.as_region_mut(),
         eob,
         tx_size,
         tx_type,
-        8,
         CpuFeatureLevel::RUST,
       );
       assert_eq!(rust_dst.data_origin(), dst.data_origin());
diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs
index 8892429052..50e8801a78 100644
--- a/src/asm/x86/cdef.rs
+++ b/src/asm/x86/cdef.rs
@@ -41,13 +41,13 @@ const fn decimate_index(xdec: usize, ydec: usize) -> usize {
   ((ydec << 1) | xdec) & 3
 }
 
-pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
+pub(crate) unsafe fn cdef_filter_block<T: Pixel, const BD: usize>(
   dst: &mut PlaneRegionMut<'_, T>, src: *const T, src_stride: isize,
-  pri_strength: i32, sec_strength: i32, dir: usize, damping: i32,
-  bit_depth: usize, xdec: usize, ydec: usize, edges: u8, cpu: CpuFeatureLevel,
+  pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, xdec: usize,
+  ydec: usize, edges: u8, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<T>| {
-    rust::cdef_filter_block(
+    rust::cdef_filter_block::<_, _, BD>(
       dst,
       src,
       src_stride,
@@ -55,7 +55,6 @@ pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
       sec_strength,
       dir,
       damping,
-      bit_depth,
       xdec,
       ydec,
       edges,
@@ -124,7 +123,7 @@ pub(crate) unsafe fn cdef_filter_block<T: Pixel>(
               sec_strength,
               dir as i32,
               damping,
-              (1 << bit_depth) - 1,
+              (1 << BD) - 1,
             );
           }
           None => call_rust(dst),
@@ -316,7 +315,6 @@ mod test {
             let pri_strength = 1;
             let sec_strength = 0;
             let damping = 2;
-            let bit_depth = 8;
 
             // SAFETY: Calling functions with raw pointers--we created the
             // planes above and only read from the start.
@@ -324,8 +322,8 @@ mod test {
             // FIXME: Remove `allow` once https://github.com/rust-lang/rust-clippy/issues/8264 fixed
             #[allow(clippy::undocumented_unsafe_blocks)]
             unsafe {
-              cdef_filter_block(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              cdef_filter_block(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST);
+              cdef_filter_block::<_, 8>(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              cdef_filter_block::<_, 8>(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST);
               assert_eq!(rust_dst.data_origin(), dst.data_origin());
             }
           }
diff --git a/src/asm/x86/dist/cdef_dist.rs b/src/asm/x86/dist/cdef_dist.rs
index 6b590d3730..e8119d9d88 100644
--- a/src/asm/x86/dist/cdef_dist.rs
+++ b/src/asm/x86/dist/cdef_dist.rs
@@ -53,9 +53,9 @@ extern {
 ///
 /// - If in `check_asm` mode, panics on mismatch between native and ASM results.
 #[allow(clippy::let_and_return)]
-pub fn cdef_dist_kernel<T: Pixel>(
+pub fn cdef_dist_kernel<T: Pixel, const BD: usize>(
   src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
-  bit_depth: usize, cpu: CpuFeatureLevel,
+  cpu: CpuFeatureLevel,
 ) -> u32 {
   debug_assert!(src.plane_cfg.xdec == 0);
   debug_assert!(src.plane_cfg.ydec == 0);
@@ -67,7 +67,7 @@ pub fn cdef_dist_kernel<T: Pixel>(
   debug_assert!(h <= 8);
 
   let call_rust =
-    || -> u32 { rust::cdef_dist_kernel(dst, src, w, h, bit_depth, cpu) };
+    || -> u32 { rust::cdef_dist_kernel::<_, BD>(dst, src, w, h, cpu) };
   #[cfg(feature = "check_asm")]
   let ref_dist = call_rust();
 
@@ -112,7 +112,7 @@ pub fn cdef_dist_kernel<T: Pixel>(
     }
   };
 
-  let dist = apply_ssim_boost(sse, svar, dvar, bit_depth);
+  let dist = apply_ssim_boost::<BD>(sse, svar, dvar);
   #[cfg(feature = "check_asm")]
   assert_eq!(
     dist, ref_dist,
@@ -315,41 +315,41 @@ pub mod test {
 
   #[test]
   fn cdef_dist_simd_random() {
-    cdef_diff_tester(8, random_planes::<u8>);
+    cdef_diff_tester::<_, 8>(random_planes::<u8>);
   }
 
   #[test]
   fn cdef_dist_simd_random_hbd() {
-    cdef_diff_tester(10, random_planes::<u16>);
-    cdef_diff_tester(12, random_planes::<u16>);
+    cdef_diff_tester::<_, 10>(random_planes::<u16>);
+    cdef_diff_tester::<_, 12>(random_planes::<u16>);
   }
 
   #[test]
   fn cdef_dist_simd_large() {
-    cdef_diff_tester(8, max_planes::<u8>);
+    cdef_diff_tester::<_, 8>(max_planes::<u8>);
   }
 
   #[test]
   fn cdef_dist_simd_large_hbd() {
-    cdef_diff_tester(10, max_planes::<u16>);
-    cdef_diff_tester(12, max_planes::<u16>);
+    cdef_diff_tester::<_, 10>(max_planes::<u16>);
+    cdef_diff_tester::<_, 12>(max_planes::<u16>);
   }
 
   #[test]
   fn cdef_dist_simd_large_diff() {
-    cdef_diff_tester(8, max_diff_planes::<u8>);
+    cdef_diff_tester::<_, 8>(max_diff_planes::<u8>);
   }
 
   #[test]
   fn cdef_dist_simd_large_diff_hbd() {
-    cdef_diff_tester(10, max_diff_planes::<u16>);
-    cdef_diff_tester(12, max_diff_planes::<u16>);
+    cdef_diff_tester::<_, 10>(max_diff_planes::<u16>);
+    cdef_diff_tester::<_, 12>(max_diff_planes::<u16>);
   }
 
-  fn cdef_diff_tester<T: Pixel>(
-    bd: usize, gen_planes: fn(bd: usize) -> (Plane<T>, Plane<T>),
+  fn cdef_diff_tester<T: Pixel, const BD: usize>(
+    gen_planes: fn(bd: usize) -> (Plane<T>, Plane<T>),
   ) {
-    let (src_plane, dst_plane) = gen_planes(bd);
+    let (src_plane, dst_plane) = gen_planes(BD);
 
     let mut fail = false;
 
@@ -361,21 +361,19 @@ pub mod test {
         let src_region = src_plane.region(area);
         let dst_region = dst_plane.region(area);
 
-        let rust = rust::cdef_dist_kernel(
+        let rust = rust::cdef_dist_kernel::<_, BD>(
           &src_region,
           &dst_region,
           w,
           h,
-          bd,
           CpuFeatureLevel::default(),
         );
 
-        let simd = cdef_dist_kernel(
+        let simd = cdef_dist_kernel::<_, BD>(
           &src_region,
           &dst_region,
           w,
           h,
-          bd,
           CpuFeatureLevel::default(),
         );
 
diff --git a/src/asm/x86/dist/mod.rs b/src/asm/x86/dist/mod.rs
index 676787adf3..08f3381ab0 100644
--- a/src/asm/x86/dist/mod.rs
+++ b/src/asm/x86/dist/mod.rs
@@ -286,11 +286,11 @@ pub(crate) const fn to_index(bsize: BlockSize) -> usize {
 #[allow(clippy::let_and_return)]
 pub fn get_sad<T: Pixel>(
   src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
-  bit_depth: usize, cpu: CpuFeatureLevel,
+  cpu: CpuFeatureLevel,
 ) -> u32 {
   let bsize_opt = BlockSize::from_width_and_height_opt(w, h);
 
-  let call_rust = || -> u32 { rust::get_sad(dst, src, w, h, bit_depth, cpu) };
+  let call_rust = || -> u32 { rust::get_sad(dst, src, w, h, cpu) };
 
   #[cfg(feature = "check_asm")]
   let ref_dist = call_rust();
@@ -338,13 +338,13 @@ pub fn get_sad<T: Pixel>(
 /// - If in `check_asm` mode, panics on mismatch between native and ASM results.
 #[inline(always)]
 #[allow(clippy::let_and_return)]
-pub fn get_satd<T: Pixel>(
+pub fn get_satd<T: Pixel, const BD: usize>(
   src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
-  bit_depth: usize, cpu: CpuFeatureLevel,
+  cpu: CpuFeatureLevel,
 ) -> u32 {
   let bsize_opt = BlockSize::from_width_and_height_opt(w, h);
 
-  let call_rust = || -> u32 { rust::get_satd(dst, src, w, h, bit_depth, cpu) };
+  let call_rust = || -> u32 { rust::get_satd(dst, src, w, h, cpu) };
 
   #[cfg(feature = "check_asm")]
   let ref_dist = call_rust();
@@ -374,7 +374,7 @@ pub fn get_satd<T: Pixel>(
             T::to_asm_stride(src.plane_cfg.stride),
             dst.data_ptr() as *const _,
             T::to_asm_stride(dst.plane_cfg.stride),
-            (1 << bit_depth) - 1,
+            (1 << BD) - 1,
           )
         },
         None => call_rust(),
@@ -735,12 +735,12 @@ mod test {
   use rand::random;
   use std::str::FromStr;
 
-  macro_rules! test_dist_fns {
-    ($(($W:expr, $H:expr)),*, $DIST_TY:ident, $BD:expr, $OPT:ident, $OPTLIT:tt) => {
+  macro_rules! test_dist_sad_fns {
+    ($(($W:expr, $H:expr)),*, $OPT:ident, $OPTLIT:tt, $BD:expr) => {
       $(
         paste::item! {
           #[test]
-          fn [<get_ $DIST_TY _ $W x $H _bd_ $BD _ $OPT>]() {
+          fn [<get_sad _ $W x $H _bd_ $BD _ $OPT>]() {
             if !is_x86_feature_detected!($OPTLIT) {
               eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT);
               return;
@@ -755,8 +755,8 @@ mod test {
                 *s = random::<u8>() as u16 * $BD / 8;
                 *d = random::<u8>() as u16 * $BD / 8;
               }
-              let result = [<get_ $DIST_TY>](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              let rust_result = [<get_ $DIST_TY>](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST);
+              let result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
 
               assert_eq!(rust_result, result);
             } else {
@@ -768,8 +768,8 @@ mod test {
                 *s = random::<u8>();
                 *d = random::<u8>();
               }
-              let result = [<get_ $DIST_TY>](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-              let rust_result = [<get_ $DIST_TY>](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST);
+              let result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = get_sad(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
 
               assert_eq!(rust_result, result);
             }
@@ -779,7 +779,51 @@ mod test {
     }
   }
 
-  test_dist_fns!(
+  macro_rules! test_dist_satd_fns {
+    ($(($W:expr, $H:expr)),*, $OPT:ident, $OPTLIT:tt, $BD:expr) => {
+      $(
+        paste::item! {
+          #[test]
+          fn [<get_satd_ $W x $H _bd_ $BD _ $OPT>]() {
+            if !is_x86_feature_detected!($OPTLIT) {
+              eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT);
+              return;
+            }
+
+            if $BD > 8 {
+              // dynamic allocation: test
+              let mut src = Plane::from_slice(&[0u16; $W * $H], $W);
+              // dynamic allocation: test
+              let mut dst = Plane::from_slice(&[0u16; $W * $H], $W);
+              for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) {
+                *s = random::<u8>() as u16 * $BD / 8;
+                *d = random::<u8>() as u16 * $BD / 8;
+              }
+              let result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
+
+              assert_eq!(rust_result, result);
+            } else {
+              // dynamic allocation: test
+              let mut src = Plane::from_slice(&[0u8; $W * $H], $W);
+              // dynamic allocation: test
+              let mut dst = Plane::from_slice(&[0u8; $W * $H], $W);
+              for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) {
+                *s = random::<u8>();
+                *d = random::<u8>();
+              }
+              let result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+              let rust_result = get_satd::<_, $BD>(&src.as_region(), &dst.as_region(), $W, $H, CpuFeatureLevel::RUST);
+
+              assert_eq!(rust_result, result);
+            }
+          }
+        }
+      )*
+    }
+  }
+
+  test_dist_sad_fns!(
     (4, 4),
     (16, 16),
     (8, 8),
@@ -802,13 +846,12 @@ mod test {
     (128, 64),
     (16, 64),
     (64, 16),
-    sad,
-    10,
     ssse3,
-    "ssse3"
+    "ssse3",
+    10
   );
 
-  test_dist_fns!(
+  test_dist_sad_fns!(
     (4, 4),
     (16, 16),
     (8, 8),
@@ -831,13 +874,12 @@ mod test {
     (128, 64),
     (16, 64),
     (64, 16),
-    sad,
-    10,
     avx2,
-    "avx2"
+    "avx2",
+    10
   );
 
-  test_dist_fns!(
+  test_dist_sad_fns!(
     (4, 4),
     (4, 8),
     (4, 16),
@@ -849,13 +891,12 @@ mod test {
     (32, 32),
     (64, 64),
     (128, 128),
-    sad,
-    8,
     sse2,
-    "sse2"
+    "sse2",
+    8
   );
 
-  test_dist_fns!(
+  test_dist_sad_fns!(
     (16, 4),
     (16, 8),
     (16, 16),
@@ -871,17 +912,16 @@ mod test {
     (64, 128),
     (128, 64),
     (128, 128),
-    sad,
-    8,
     avx2,
-    "avx2"
+    "avx2",
+    8
   );
 
-  test_dist_fns!((8, 8), satd, 8, ssse3, "ssse3");
+  test_dist_satd_fns!((8, 8), ssse3, "ssse3", 8);
 
-  test_dist_fns!((4, 4), satd, 8, sse4, "sse4.1");
+  test_dist_satd_fns!((4, 4), sse4, "sse4.1", 8);
 
-  test_dist_fns!(
+  test_dist_satd_fns!(
     (4, 4),
     (8, 8),
     (16, 16),
@@ -904,13 +944,12 @@ mod test {
     (32, 8),
     (16, 64),
     (64, 16),
-    satd,
-    8,
     avx2,
-    "avx2"
+    "avx2",
+    8
   );
 
-  test_dist_fns!(
+  test_dist_satd_fns!(
     (4, 4),
     (8, 8),
     (16, 16),
@@ -933,13 +972,12 @@ mod test {
     (32, 8),
     (16, 64),
     (64, 16),
-    satd,
-    10,
     avx2,
-    "avx2"
+    "avx2",
+    10
   );
 
-  test_dist_fns!(
+  test_dist_satd_fns!(
     (4, 4),
     (8, 8),
     (16, 16),
@@ -962,9 +1000,8 @@ mod test {
     (32, 8),
     (16, 64),
     (64, 16),
-    satd,
-    12,
     avx2,
-    "avx2"
+    "avx2",
+    12
   );
 }
diff --git a/src/asm/x86/dist/sse.rs b/src/asm/x86/dist/sse.rs
index 08c710da11..dd4e6ef7de 100644
--- a/src/asm/x86/dist/sse.rs
+++ b/src/asm/x86/dist/sse.rs
@@ -92,8 +92,7 @@ declare_asm_hbd_sse_fn![
 #[allow(clippy::let_and_return)]
 pub fn get_weighted_sse<T: Pixel>(
   src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, scale: &[u32],
-  scale_stride: usize, w: usize, h: usize, bit_depth: usize,
-  cpu: CpuFeatureLevel,
+  scale_stride: usize, w: usize, h: usize, cpu: CpuFeatureLevel,
 ) -> u64 {
   // Assembly breaks if imp block size changes.
   assert_eq!(IMPORTANCE_BLOCK_SIZE >> 1, 4);
@@ -101,7 +100,7 @@ pub fn get_weighted_sse<T: Pixel>(
   let bsize_opt = BlockSize::from_width_and_height_opt(w, h);
 
   let call_rust = || -> u64 {
-    rust::get_weighted_sse(dst, src, scale, scale_stride, w, h, bit_depth, cpu)
+    rust::get_weighted_sse(dst, src, scale, scale_stride, w, h, cpu)
   };
 
   #[cfg(feature = "check_asm")]
@@ -381,7 +380,6 @@ pub mod test {
         SCALE_STRIDE,
         block.width(),
         block.height(),
-        bd,
         CpuFeatureLevel::default(),
       );
 
@@ -392,7 +390,6 @@ pub mod test {
         SCALE_STRIDE,
         block.width(),
         block.height(),
-        bd,
         CpuFeatureLevel::default(),
       );
 
diff --git a/src/asm/x86/mc.rs b/src/asm/x86/mc.rs
index 2f5e0b3b8f..19af81e54b 100644
--- a/src/asm/x86/mc.rs
+++ b/src/asm/x86/mc.rs
@@ -91,15 +91,14 @@ const fn get_2d_mode_idx(mode_x: FilterMode, mode_y: FilterMode) -> usize {
 /// - If `width * height` is greater than the length of `tmp1` or `tmp2`
 /// - If `width` and `height` do not fit within the bounds of `src`
 #[inline(always)]
-pub fn put_8tap<T: Pixel>(
+pub fn put_8tap<T: Pixel, const BD: usize>(
   dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize,
   height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
-  mode_y: FilterMode, bit_depth: usize, cpu: CpuFeatureLevel,
+  mode_y: FilterMode, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<'_, T>| {
-    rust::put_8tap(
-      dst, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth,
-      cpu,
+    rust::put_8tap::<_, BD>(
+      dst, src, width, height, col_frac, row_frac, mode_x, mode_y, cpu,
     );
   };
   #[cfg(feature = "check_asm")]
@@ -149,7 +148,7 @@ pub fn put_8tap<T: Pixel>(
             height as i32,
             col_frac,
             row_frac,
-            (1 << bit_depth) - 1,
+            (1 << BD) - 1,
           ),
           None => call_rust(dst),
         }
@@ -176,15 +175,14 @@ pub fn put_8tap<T: Pixel>(
 /// - If `width * height` is greater than the length of `tmp1` or `tmp2`
 /// - If `width` and `height` do not fit within the bounds of `src`
 #[inline(always)]
-pub fn prep_8tap<T: Pixel>(
+pub fn prep_8tap<T: Pixel, const BD: usize>(
   tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize,
   col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
-  bit_depth: usize, cpu: CpuFeatureLevel,
+  cpu: CpuFeatureLevel,
 ) {
   let call_rust = |tmp: &mut [i16]| {
-    rust::prep_8tap(
-      tmp, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth,
-      cpu,
+    rust::prep_8tap::<_, BD>(
+      tmp, src, width, height, col_frac, row_frac, mode_x, mode_y, cpu,
     );
   };
   #[cfg(feature = "check_asm")]
@@ -223,7 +221,7 @@ pub fn prep_8tap<T: Pixel>(
           None => call_rust(tmp),
         }
       }
-      PixelType::U16 if bit_depth > 8 => {
+      PixelType::U16 if BD > 8 => {
         match PREP_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] {
           Some(func) => (func)(
             tmp.as_mut_ptr() as *mut _,
@@ -233,7 +231,7 @@ pub fn prep_8tap<T: Pixel>(
             height as i32,
             col_frac,
             row_frac,
-            (1 << bit_depth) - 1,
+            (1 << BD) - 1,
           ),
           None => call_rust(tmp),
         }
@@ -253,12 +251,12 @@ pub fn prep_8tap<T: Pixel>(
 /// - If `width` is not between 2 and 128
 /// - If `width * height` is greater than the length of `tmp1` or `tmp2`
 /// - If `width` and `height` do not fit within the bounds of `dst`
-pub fn mc_avg<T: Pixel>(
+pub fn mc_avg<T: Pixel, const BD: usize>(
   dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize,
-  height: usize, bit_depth: usize, cpu: CpuFeatureLevel,
+  height: usize, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<'_, T>| {
-    rust::mc_avg(dst, tmp1, tmp2, width, height, bit_depth, cpu);
+    rust::mc_avg::<_, BD>(dst, tmp1, tmp2, width, height, cpu);
   };
   #[cfg(feature = "check_asm")]
   let ref_dst = {
@@ -292,7 +290,7 @@ pub fn mc_avg<T: Pixel>(
         ),
         None => call_rust(dst),
       },
-      PixelType::U16 if bit_depth > 8 => match AVG_HBD_FNS[cpu.as_index()] {
+      PixelType::U16 if BD > 8 => match AVG_HBD_FNS[cpu.as_index()] {
         Some(func) => (func)(
           dst.data_ptr_mut() as *mut _,
           T::to_asm_stride(dst.plane_cfg.stride),
@@ -300,7 +298,7 @@ pub fn mc_avg<T: Pixel>(
           tmp2.as_ptr(),
           width as i32,
           height as i32,
-          (1 << bit_depth) - 1,
+          (1 << BD) - 1,
         ),
         None => call_rust(dst),
       },
@@ -652,8 +650,8 @@ mod test {
 
               for mv in &test_mvs {
                 let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv);
-                super::put_8tap(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-                super::put_8tap(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST);
+                super::put_8tap::<_, 8>(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+                super::put_8tap::<_, 8>(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST);
 
                 assert_eq!(&*dst1.data, &*dst2.data);
               }
@@ -670,8 +668,8 @@ mod test {
 
               for mv in &test_mvs {
                 let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv);
-                super::put_8tap(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-                super::put_8tap(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST);
+                super::put_8tap::<_, 8>(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+                super::put_8tap::<_, 8>(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST);
 
                 assert_eq!(&*dst1.data, &*dst2.data);
               }
@@ -740,8 +738,8 @@ mod test {
 
               for mv in &test_mvs {
                 let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv);
-                super::prep_8tap(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-                super::prep_8tap(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST);
+                super::prep_8tap::<_, 8>(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+                super::prep_8tap::<_, 8>(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST);
               }
             } else {
               // dynamic allocation: test
@@ -752,8 +750,8 @@ mod test {
 
               for mv in &test_mvs {
                 let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv);
-                super::prep_8tap(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap());
-                super::prep_8tap(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST);
+                super::prep_8tap::<_, 8>(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::from_str($OPTLIT).unwrap());
+                super::prep_8tap::<_, 8>(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, CpuFeatureLevel::RUST);
               }
             };
 
diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs
index e001ae98ef..67cec03afe 100644
--- a/src/asm/x86/predict.rs
+++ b/src/asm/x86/predict.rs
@@ -194,16 +194,15 @@ decl_cfl_pred_hbd_fn! {
 }
 
 #[inline(always)]
-pub fn dispatch_predict_intra<T: Pixel>(
+pub fn dispatch_predict_intra<T: Pixel, const BD: usize>(
   mode: PredictionMode, variant: PredictionVariant,
-  dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize,
-  ac: &[i16], angle: isize, ief_params: Option<IntraEdgeFilterParameters>,
+  dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, ac: &[i16], angle: isize,
+  ief_params: Option<IntraEdgeFilterParameters>,
   edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |dst: &mut PlaneRegionMut<'_, T>| {
-    rust::dispatch_predict_intra(
-      mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf,
-      cpu,
+    rust::dispatch_predict_intra::<T, BD>(
+      mode, variant, dst, tx_size, ac, angle, ief_params, edge_buf, cpu,
     );
   };
 
@@ -362,11 +361,11 @@ pub fn dispatch_predict_intra<T: Pixel>(
           }
         }
       }
-      PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 && bit_depth > 8 => {
+      PixelType::U16 if cpu >= CpuFeatureLevel::AVX2 && BD > 8 => {
         let dst_ptr = dst.data_ptr_mut() as *mut _;
         let edge_ptr =
           edge_buf.data.as_ptr().offset(2 * MAX_TX_SIZE as isize) as *const _;
-        let bd_max = (1 << bit_depth) - 1;
+        let bd_max = (1 << BD) - 1;
         match mode {
           PredictionMode::DC_PRED => {
             (match variant {
diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs
index 28dbabedc7..3228902391 100644
--- a/src/asm/x86/quantize.rs
+++ b/src/asm/x86/quantize.rs
@@ -24,7 +24,6 @@ type DequantizeFn = unsafe fn(
   _eob: usize,
   rcoeffs_ptr: *mut i16,
   tx_size: TxSize,
-  bit_depth: usize,
   dc_delta_q: i8,
   ac_delta_q: i8,
 );
@@ -32,18 +31,17 @@ type DequantizeFn = unsafe fn(
 cpu_function_lookup_table!(
   DEQUANTIZE_FNS: [Option<DequantizeFn>],
   default: None,
-  [(AVX2, Some(dequantize_avx2))]
+  [(AVX2, Some(dequantize_avx2_8bpc))]
 );
 
 #[inline(always)]
-pub fn dequantize<T: Coefficient>(
+pub fn dequantize<T: Coefficient, const BD: usize>(
   qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-  bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
+  dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
 ) {
   let call_rust = |rcoeffs: &mut [T]| {
-    crate::quantize::rust::dequantize(
-      qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q,
-      ac_delta_q, cpu,
+    crate::quantize::rust::dequantize::<_, BD>(
+      qindex, coeffs, eob, rcoeffs, tx_size, dc_delta_q, ac_delta_q, cpu,
     );
   };
 
@@ -67,7 +65,6 @@ pub fn dequantize<T: Coefficient>(
             eob,
             rcoeffs.as_mut_ptr() as *mut _,
             tx_size,
-            bit_depth,
             dc_delta_q,
             ac_delta_q,
           )
@@ -87,18 +84,19 @@ pub fn dequantize<T: Coefficient>(
 }
 
 #[target_feature(enable = "avx2")]
-unsafe fn dequantize_avx2(
+unsafe fn dequantize_avx2_8bpc(
   qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16,
-  tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
+  tx_size: TxSize, dc_delta_q: i8, ac_delta_q: i8,
 ) {
+  const BD: usize = 8;
   let log_tx_scale = _mm256_set1_epi32(get_log_tx_scale(tx_size) as i32);
 
   let quants_ac =
-    _mm256_set1_epi32(ac_q(qindex, ac_delta_q, bit_depth).get() as i32);
+    _mm256_set1_epi32(ac_q::<BD>(qindex, ac_delta_q).get() as i32);
   // Use the dc quantize as first vector element for the first iteration
   let mut quants = _mm256_insert_epi32(
     quants_ac,
-    dc_q(qindex, dc_delta_q, bit_depth).get() as i32,
+    dc_q::<BD>(qindex, dc_delta_q).get() as i32,
     0,
   );
 
@@ -169,12 +167,10 @@ mod test {
       TX_8X32, TX_32X8, TX_16X64, TX_64X16,
     ];
 
-    let bd: usize = 8;
-
     for &tx_size in &tx_sizes {
       let qindex: u8 = rng.gen_range((MINQ as u8)..(MAXQ as u8));
-      let dc_quant = dc_q(qindex, 0, bd).get() as i16;
-      let ac_quant = ac_q(qindex, 0, bd).get() as i16;
+      let dc_quant = dc_q::<8>(qindex, 0).get() as i16;
+      let ac_quant = ac_q::<8>(qindex, 0).get() as i16;
 
       // Test the min, max, and random eobs
       let eobs = {
@@ -200,13 +196,12 @@ mod test {
         }
 
         // Rely on quantize's internal tests
-        dequantize(
+        dequantize::<_, 8>(
           qindex,
           &qcoeffs.data,
           eob,
           &mut rcoeffs.data,
           tx_size,
-          bd,
           0,
           0,
           CpuFeatureLevel::default(),
diff --git a/src/asm/x86/transform/forward.rs b/src/asm/x86/transform/forward.rs
index 18b1171517..d341915a0d 100644
--- a/src/asm/x86/transform/forward.rs
+++ b/src/asm/x86/transform/forward.rs
@@ -332,9 +332,9 @@ fn cast_mut<const N: usize, T>(x: &mut [T]) -> &mut [T; N] {
 
 #[allow(clippy::identity_op, clippy::erasing_op)]
 #[target_feature(enable = "avx2")]
-unsafe fn forward_transform_avx2<T: Coefficient>(
+unsafe fn forward_transform_avx2<T: Coefficient, const BD: usize>(
   input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-  tx_type: TxType, bd: usize,
+  tx_type: TxType,
 ) {
   // Note when assigning txfm_size_col, we use the txfm_size from the
   // row configuration and vice versa. This is intentionally done to
@@ -350,7 +350,7 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
 
   let mut tmp: Aligned<[I32X8; 64 * 64 / 8]> = Aligned::uninitialized();
   let buf = &mut tmp.data[..txfm_size_col * (txfm_size_row / 8).max(1)];
-  let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd);
+  let cfg = Txfm2DFlipCfg::fwd::<BD>(tx_type, tx_size);
 
   let txfm_func_col = get_func_i32x8(cfg.txfm_type_col);
   let txfm_func_row = get_func_i32x8(cfg.txfm_type_row);
@@ -507,18 +507,20 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
 /// # Panics
 ///
 /// - If called with an invalid combination of `tx_size` and `tx_type`
-pub fn forward_transform<T: Coefficient>(
+pub fn forward_transform<T: Coefficient, const BD: usize>(
   input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-  tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
+  tx_type: TxType, cpu: CpuFeatureLevel,
 ) {
   assert!(valid_av1_transform(tx_size, tx_type));
   if cpu >= CpuFeatureLevel::AVX2 {
     // SAFETY: Calls Assembly code.
     unsafe {
-      forward_transform_avx2(input, output, stride, tx_size, tx_type, bd);
+      forward_transform_avx2::<_, BD>(input, output, stride, tx_size, tx_type);
     }
   } else {
-    rust::forward_transform(input, output, stride, tx_size, tx_type, bd, cpu);
+    rust::forward_transform::<_, BD>(
+      input, output, stride, tx_size, tx_type, cpu,
+    );
   }
 }
 
@@ -562,22 +564,20 @@ mod test {
         let mut output_simd = vec![0i16; area];
 
         println!("Testing combination {:?}, {:?}", tx_size, tx_type);
-        forward_transform(
+        forward_transform::<_, 8>(
           &input[..],
           &mut output_ref[..],
           tx_size.width(),
           tx_size,
           tx_type,
-          8,
           CpuFeatureLevel::RUST,
         );
-        forward_transform(
+        forward_transform::<_, 8>(
           &input[..],
           &mut output_simd[..],
           tx_size.width(),
           tx_size,
           tx_type,
-          8,
           cpu,
         );
         assert_eq!(output_ref, output_simd)
diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs
index 027cdf19b7..84b728d5e0 100644
--- a/src/asm/x86/transform/inverse.rs
+++ b/src/asm/x86/transform/inverse.rs
@@ -16,27 +16,26 @@ use crate::{Pixel, PixelType};
 use crate::asm::shared::transform::inverse::*;
 use crate::asm::shared::transform::*;
 
-pub fn inverse_transform_add<T: Pixel>(
+pub fn inverse_transform_add<T: Pixel, const BD: usize>(
   input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
-  tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
+  tx_size: TxSize, tx_type: TxType, cpu: CpuFeatureLevel,
 ) {
   match T::type_enum() {
     PixelType::U8 => {
       if let Some(func) = INV_TXFM_FNS[cpu.as_index()]
         [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)]
       {
-        return call_inverse_func(
+        return call_inverse_func::<_, BD>(
           func,
           input,
           output,
           eob,
           tx_size.width(),
           tx_size.height(),
-          bd,
         );
       }
     }
-    PixelType::U16 if bd == 10 => {
+    PixelType::U16 if BD == 10 => {
       if let Some(func) = INV_TXFM_HBD_FNS_10[cpu.as_index()]
         [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)]
       {
@@ -47,11 +46,10 @@ pub fn inverse_transform_add<T: Pixel>(
           eob,
           tx_size.width(),
           tx_size.height(),
-          bd,
         );
       }
     }
-    PixelType::U16 => {
+    PixelType::U16 if BD == 12 => {
       if let Some(func) = INV_TXFM_HBD_FNS_12[cpu.as_index()]
         [get_tx_size_idx(tx_size)][get_tx_type_idx(tx_type)]
       {
@@ -62,13 +60,15 @@ pub fn inverse_transform_add<T: Pixel>(
           eob,
           tx_size.width(),
           tx_size.height(),
-          bd,
         );
       }
     }
+    _ => {} // no asm kernel for this case; use the Rust fallback below
   };
 
-  rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu);
+  rust::inverse_transform_add::<_, BD>(
+    input, output, eob, tx_size, tx_type, cpu,
+  );
 }
 
 macro_rules! decl_itx_fns {
diff --git a/src/cdef.rs b/src/cdef.rs
index 863399036f..71fc1ae2d1 100644
--- a/src/cdef.rs
+++ b/src/cdef.rs
@@ -196,11 +196,14 @@ pub(crate) mod rust {
 
   #[cold_for_target_arch("x86_64")]
   #[allow(clippy::erasing_op, clippy::identity_op, clippy::neg_multiply)]
-  pub(crate) unsafe fn cdef_filter_block<T: Pixel, U: Pixel>(
+  pub(crate) unsafe fn cdef_filter_block<
+    T: Pixel,
+    U: Pixel,
+    const BD: usize,
+  >(
     dst: &mut PlaneRegionMut<'_, T>, input: *const U, istride: isize,
     pri_strength: i32, sec_strength: i32, dir: usize, damping: i32,
-    bit_depth: usize, xdec: usize, ydec: usize, edges: u8,
-    _cpu: CpuFeatureLevel,
+    xdec: usize, ydec: usize, edges: u8, _cpu: CpuFeatureLevel,
   ) {
     if edges != CDEF_HAVE_ALL {
       // slowpath for unpadded border[s]
@@ -216,7 +219,7 @@ pub(crate) mod rust {
         8 >> ydec,
         edges,
       );
-      cdef_filter_block(
+      cdef_filter_block::<_, _, BD>(
         dst,
         tmp.as_ptr().offset(2 * tmpstride + 2),
         tmpstride,
@@ -224,7 +227,6 @@ pub(crate) mod rust {
         sec_strength,
         dir,
         damping,
-        bit_depth,
         xdec,
         ydec,
         CDEF_HAVE_ALL,
@@ -233,7 +235,7 @@ pub(crate) mod rust {
     } else {
       let xsize = (8 >> xdec) as isize;
       let ysize = (8 >> ydec) as isize;
-      let coeff_shift = bit_depth - 8;
+      let coeff_shift = BD - 8;
       let cdef_pri_taps = [[4, 2], [3, 3]];
       let cdef_sec_taps = [[2, 1], [2, 1]];
       let pri_taps =
@@ -322,7 +324,7 @@ fn adjust_strength(strength: i32, var: i32) -> i32 {
   }
 }
 
-pub fn cdef_analyze_superblock_range<T: Pixel>(
+pub fn cdef_analyze_superblock_range<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, in_frame: &Frame<T>, blocks: &TileBlocks<'_>,
   sb_w: usize, sb_h: usize,
 ) -> Vec<CdefDirections> {
@@ -330,17 +332,17 @@ pub fn cdef_analyze_superblock_range<T: Pixel>(
   for sby in 0..sb_h {
     for sbx in 0..sb_w {
       let sbo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby });
-      ret.push(cdef_analyze_superblock(fi, in_frame, blocks, sbo));
+      ret.push(cdef_analyze_superblock::<_, BD>(fi, in_frame, blocks, sbo));
     }
   }
   ret
 }
 
-pub fn cdef_analyze_superblock<T: Pixel>(
+pub fn cdef_analyze_superblock<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, in_frame: &Frame<T>, blocks: &TileBlocks<'_>,
   sbo: TileSuperBlockOffset,
 ) -> CdefDirections {
-  let coeff_shift = fi.sequence.bit_depth - 8;
+  let coeff_shift = BD - 8;
   let mut dir: CdefDirections =
     CdefDirections { dir: [[0; 8]; 8], var: [[0; 8]; 8] };
   // Each direction block is 8x8 in y, and direction computation only looks at y
@@ -396,13 +398,12 @@ pub fn cdef_analyze_superblock<T: Pixel>(
 /// # Panics
 ///
 /// - If called with invalid parameters
-pub fn cdef_filter_superblock<T: Pixel>(
+pub fn cdef_filter_superblock<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, input: &Frame<T>, output: &mut TileMut<'_, T>,
   blocks: &TileBlocks<'_>, tile_sbo: TileSuperBlockOffset, cdef_index: u8,
   cdef_dirs: &CdefDirections,
 ) {
-  let bit_depth = fi.sequence.bit_depth;
-  let coeff_shift = fi.sequence.bit_depth as i32 - 8;
+  let coeff_shift = BD as i32 - 8;
   let cdef_damping = fi.cdef_damping as i32;
   let cdef_y_strength = fi.cdef_y_strengths[cdef_index as usize];
   let cdef_uv_strength = fi.cdef_uv_strengths[cdef_index as usize];
@@ -536,7 +537,7 @@ pub fn cdef_filter_superblock<T: Pixel>(
                 0 <= in_po.y - if edges & CDEF_HAVE_TOP > 0 { 2 } else { 0 }
               );
 
-              cdef_filter_block(
+              cdef_filter_block::<_, BD>(
                 out_block,
                 in_slice.as_ptr(),
                 in_stride as isize,
@@ -544,7 +545,6 @@ pub fn cdef_filter_superblock<T: Pixel>(
                 local_sec_strength,
                 local_dir,
                 local_damping,
-                bit_depth,
                 xdec,
                 ydec,
                 edges,
@@ -592,7 +592,7 @@ pub fn cdef_filter_superblock<T: Pixel>(
 //   don't exist.
 
 #[hawktracer(cdef_filter_tile)]
-pub fn cdef_filter_tile<T: Pixel>(
+pub fn cdef_filter_tile<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, input: &Frame<T>, tb: &TileBlocks,
   output: &mut TileMut<'_, T>,
 ) {
@@ -613,9 +613,10 @@ pub fn cdef_filter_tile<T: Pixel>(
       // the input Frame.
       let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: fbx, y: fby });
       let cdef_index = tb.get_cdef(tile_sbo);
-      let cdef_dirs = cdef_analyze_superblock(fi, input, tb, tile_sbo);
+      let cdef_dirs =
+        cdef_analyze_superblock::<_, BD>(fi, input, tb, tile_sbo);
 
-      cdef_filter_superblock(
+      cdef_filter_superblock::<_, BD>(
         fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs,
       );
     }
diff --git a/src/deblock.rs b/src/deblock.rs
index 21a4bf19aa..43afbfb468 100644
--- a/src/deblock.rs
+++ b/src/deblock.rs
@@ -146,9 +146,10 @@ fn deblock_level(
 
 // four taps, 4 outputs (two are trivial)
 #[inline]
-fn filter_narrow2_4(
-  p1: i32, p0: i32, q0: i32, q1: i32, shift: usize,
+fn filter_narrow2_4<const BD: usize>(
+  p1: i32, p0: i32, q0: i32, q1: i32,
 ) -> [i32; 4] {
+  let shift = BD - 8;
   let filter0 = clamp(p1 - q1, -128 << shift, (128 << shift) - 1);
   let filter1 =
     clamp(filter0 + 3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3;
@@ -178,28 +179,29 @@ fn filter_narrow2_4(
 
 // six taps, 6 outputs (four are trivial)
 #[inline]
-fn filter_narrow2_6(
-  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize,
+fn filter_narrow2_6<const BD: usize>(
+  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32,
 ) -> [i32; 6] {
-  let x = filter_narrow2_4(p1, p0, q0, q1, shift);
+  let x = filter_narrow2_4::<BD>(p1, p0, q0, q1);
   [p2, x[0], x[1], x[2], x[3], q2]
 }
 
 // 12 taps, 12 outputs (ten are trivial)
 #[inline]
-fn filter_narrow2_12(
+fn filter_narrow2_12<const BD: usize>(
   p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32,
-  q2: i32, q3: i32, q4: i32, q5: i32, shift: usize,
+  q2: i32, q3: i32, q4: i32, q5: i32,
 ) -> [i32; 12] {
-  let x = filter_narrow2_4(p1, p0, q0, q1, shift);
+  let x = filter_narrow2_4::<BD>(p1, p0, q0, q1);
   [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5]
 }
 
 // four taps, 4 outputs
 #[inline]
-fn filter_narrow4_4(
-  p1: i32, p0: i32, q0: i32, q1: i32, shift: usize,
+fn filter_narrow4_4<const BD: usize>(
+  p1: i32, p0: i32, q0: i32, q1: i32,
 ) -> [i32; 4] {
+  let shift = BD - 8;
   let filter1 =
     clamp(3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3;
   // be certain our optimization removing a clamp is sound
@@ -227,20 +229,20 @@ fn filter_narrow4_4(
 
 // six taps, 6 outputs (two are trivial)
 #[inline]
-fn filter_narrow4_6(
-  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize,
+fn filter_narrow4_6<const BD: usize>(
+  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32,
 ) -> [i32; 6] {
-  let x = filter_narrow4_4(p1, p0, q0, q1, shift);
+  let x = filter_narrow4_4::<BD>(p1, p0, q0, q1);
   [p2, x[0], x[1], x[2], x[3], q2]
 }
 
 // 12 taps, 12 outputs (eight are trivial)
 #[inline]
-fn filter_narrow4_12(
+fn filter_narrow4_12<const BD: usize>(
   p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32,
-  q2: i32, q3: i32, q4: i32, q5: i32, shift: usize,
+  q2: i32, q3: i32, q4: i32, q5: i32,
 ) -> [i32; 12] {
-  let x = filter_narrow4_4(p1, p0, q0, q1, shift);
+  let x = filter_narrow4_4::<BD>(p1, p0, q0, q1);
   [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5]
 }
 
@@ -333,57 +335,63 @@ fn stride_sse<const LEN: usize>(a: &[i32; LEN], b: &[i32; LEN]) -> i64 {
 }
 
 #[inline]
-const fn _level_to_limit(level: i32, shift: usize) -> i32 {
+const fn _level_to_limit<const BD: usize>(level: i32) -> i32 {
+  let shift = BD - 8;
   level << shift
 }
 
 #[inline]
-const fn limit_to_level(limit: i32, shift: usize) -> i32 {
+const fn limit_to_level<const BD: usize>(limit: i32) -> i32 {
+  let shift = BD - 8;
   (limit + (1 << shift) - 1) >> shift
 }
 
 #[inline]
-const fn _level_to_blimit(level: i32, shift: usize) -> i32 {
+const fn _level_to_blimit<const BD: usize>(level: i32) -> i32 {
+  let shift = BD - 8;
   (3 * level + 4) << shift
 }
 
 #[inline]
-const fn blimit_to_level(blimit: i32, shift: usize) -> i32 {
+const fn blimit_to_level<const BD: usize>(blimit: i32) -> i32 {
+  let shift = BD - 8;
   (((blimit + (1 << shift) - 1) >> shift) - 2) / 3
 }
 
 #[inline]
-const fn _level_to_thresh(level: i32, shift: usize) -> i32 {
+const fn _level_to_thresh<const BD: usize>(level: i32) -> i32 {
+  let shift = BD - 8;
   level >> 4 << shift
 }
 
 #[inline]
-const fn thresh_to_level(thresh: i32, shift: usize) -> i32 {
+const fn thresh_to_level<const BD: usize>(thresh: i32) -> i32 {
+  let shift = BD - 8;
   (thresh + (1 << shift) - 1) >> shift << 4
 }
 
 #[inline]
-fn nhev4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize {
-  thresh_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift) as usize
+fn nhev4<const BD: usize>(p1: i32, p0: i32, q0: i32, q1: i32) -> usize {
+  thresh_to_level::<BD>(cmp::max((p1 - p0).abs(), (q1 - q0).abs())) as usize
 }
 
 #[inline]
-fn mask4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize {
+fn mask4<const BD: usize>(p1: i32, p0: i32, q0: i32, q1: i32) -> usize {
   cmp::max(
-    limit_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift),
-    blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift),
+    limit_to_level::<BD>(cmp::max((p1 - p0).abs(), (q1 - q0).abs())),
+    blimit_to_level::<BD>((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2),
   ) as usize
 }
 
 #[inline]
-fn deblock_size4_inner(
-  [p1, p0, q0, q1]: [i32; 4], level: usize, bd: usize,
+fn deblock_size4_inner<const BD: usize>(
+  [p1, p0, q0, q1]: [i32; 4], level: usize,
 ) -> Option<[i32; 4]> {
-  if mask4(p1, p0, q0, q1, bd - 8) <= level {
-    let x = if nhev4(p1, p0, q0, q1, bd - 8) <= level {
-      filter_narrow4_4(p1, p0, q0, q1, bd - 8)
+  if mask4::<BD>(p1, p0, q0, q1) <= level {
+    let x = if nhev4::<BD>(p1, p0, q0, q1) <= level {
+      filter_narrow4_4::<BD>(p1, p0, q0, q1)
     } else {
-      filter_narrow2_4(p1, p0, q0, q1, bd - 8)
+      filter_narrow2_4::<BD>(p1, p0, q0, q1)
     };
     Some(x)
   } else {
@@ -392,26 +400,26 @@ fn deblock_size4_inner(
 }
 
 // Assumes rec[0] is set 2 taps back from the edge
-fn deblock_v_size4<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_v_size4<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for y in 0..4 {
     let p = &rec[y];
     let vals = [p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_()];
-    if let Some(data) = deblock_size4_inner(vals, level, bd) {
+    if let Some(data) = deblock_size4_inner::<BD>(vals, level) {
       copy_horizontal(rec, 0, y, &data);
     }
   }
 }
 
 // Assumes rec[0] is set 2 taps back from the edge
-fn deblock_h_size4<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_h_size4<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for x in 0..4 {
     let vals =
       [rec[0][x].as_(), rec[1][x].as_(), rec[2][x].as_(), rec[3][x].as_()];
-    if let Some(data) = deblock_size4_inner(vals, level, bd) {
+    if let Some(data) = deblock_size4_inner::<BD>(vals, level) {
       copy_vertical(rec, x, 0, &data);
     }
   }
@@ -419,9 +427,9 @@ fn deblock_h_size4<T: Pixel>(
 
 // Assumes rec[0] and src[0] are set 2 taps back from the edge.
 // Accesses four taps, accumulates four pixels into the tally
-fn sse_size4<T: Pixel>(
+fn sse_size4<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>,
-  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize,
+  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool,
 ) {
   for i in 0..4 {
     let (p1, p0, q0, q1, a) = if horizontal_p {
@@ -445,13 +453,13 @@ fn sse_size4<T: Pixel>(
     // three possibilities: no filter, narrow2 and narrow4
     // All possibilities produce four outputs
     let none: [_; 4] = [p1, p0, q0, q1];
-    let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8);
-    let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8);
+    let narrow2 = filter_narrow2_4::<BD>(p1, p0, q0, q1);
+    let narrow4 = filter_narrow4_4::<BD>(p1, p0, q0, q1);
 
     // mask4 sets the dividing line for filter vs no filter
     // nhev4 sets the dividing line between narrow2 and narrow4
-    let mask = clamp(mask4(p1, p0, q0, q1, bd - 8), 1, MAX_LOOP_FILTER + 1);
-    let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1);
+    let mask = clamp(mask4::<BD>(p1, p0, q0, q1), 1, MAX_LOOP_FILTER + 1);
+    let nhev = clamp(nhev4::<BD>(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1);
 
     // sse for each; short-circuit the 'special' no-op cases.
     let sse_none = stride_sse(&a, &none);
@@ -474,18 +482,15 @@ fn sse_size4<T: Pixel>(
 }
 
 #[inline]
-fn mask6(
-  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize,
+fn mask6<const BD: usize>(
+  p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32,
 ) -> usize {
   cmp::max(
-    limit_to_level(
-      cmp::max(
-        (p2 - p1).abs(),
-        cmp::max((p1 - p0).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs())),
-      ),
-      shift,
-    ),
-    blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift),
+    limit_to_level::<BD>(cmp::max(
+      (p2 - p1).abs(),
+      cmp::max((p1 - p0).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs())),
+    )),
+    blimit_to_level::<BD>((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2),
   ) as usize
 }
 
@@ -498,17 +503,17 @@ fn flat6(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32) -> usize {
 }
 
 #[inline]
-fn deblock_size6_inner(
-  [p2, p1, p0, q0, q1, q2]: [i32; 6], level: usize, bd: usize,
+fn deblock_size6_inner<const BD: usize>(
+  [p2, p1, p0, q0, q1, q2]: [i32; 6], level: usize,
 ) -> Option<[i32; 4]> {
-  if mask6(p2, p1, p0, q0, q1, q2, bd - 8) <= level {
-    let flat = 1 << (bd - 8);
+  if mask6::<BD>(p2, p1, p0, q0, q1, q2) <= level {
+    let flat = 1 << (BD - 8);
     let x = if flat6(p2, p1, p0, q0, q1, q2) <= flat {
       filter_wide6_4(p2, p1, p0, q0, q1, q2)
-    } else if nhev4(p1, p0, q0, q1, bd - 8) <= level {
-      filter_narrow4_4(p1, p0, q0, q1, bd - 8)
+    } else if nhev4::<BD>(p1, p0, q0, q1) <= level {
+      filter_narrow4_4::<BD>(p1, p0, q0, q1)
     } else {
-      filter_narrow2_4(p1, p0, q0, q1, bd - 8)
+      filter_narrow2_4::<BD>(p1, p0, q0, q1)
     };
     Some(x)
   } else {
@@ -517,22 +522,22 @@ fn deblock_size6_inner(
 }
 
 // Assumes slice[0] is set 3 taps back from the edge
-fn deblock_v_size6<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_v_size6<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for y in 0..4 {
     let p = &rec[y];
     let vals =
       [p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_(), p[4].as_(), p[5].as_()];
-    if let Some(data) = deblock_size6_inner(vals, level, bd) {
+    if let Some(data) = deblock_size6_inner::<BD>(vals, level) {
       copy_horizontal(rec, 1, y, &data);
     }
   }
 }
 
 // Assumes slice[0] is set 3 taps back from the edge
-fn deblock_h_size6<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_h_size6<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for x in 0..4 {
     let vals = [
@@ -543,7 +548,7 @@ fn deblock_h_size6<T: Pixel>(
       rec[4][x].as_(),
       rec[5][x].as_(),
     ];
-    if let Some(data) = deblock_size6_inner(vals, level, bd) {
+    if let Some(data) = deblock_size6_inner::<BD>(vals, level) {
       copy_vertical(rec, x, 1, &data);
     }
   }
@@ -551,11 +556,11 @@ fn deblock_h_size6<T: Pixel>(
 
 // Assumes rec[0] and src[0] are set 3 taps back from the edge.
 // Accesses six taps, accumulates four pixels into the tally
-fn sse_size6<T: Pixel>(
+fn sse_size6<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>,
-  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize,
+  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool,
 ) {
-  let flat = 1 << (bd - 8);
+  let flat = 1 << (BD - 8);
   for i in 0..4 {
     let (p2, p1, p0, q0, q1, q2, a) = if horizontal_p {
       // six taps
@@ -587,16 +592,16 @@ fn sse_size6<T: Pixel>(
     // All possibilities produce four outputs
     let none: [_; 4] = [p1, p0, q0, q1];
     let wide6 = filter_wide6_4(p2, p1, p0, q0, q1, q2);
-    let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8);
-    let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8);
+    let narrow2 = filter_narrow2_4::<BD>(p1, p0, q0, q1);
+    let narrow4 = filter_narrow4_4::<BD>(p1, p0, q0, q1);
 
     // mask6 sets the dividing line for filter vs no filter
     // flat6 decides between wide and narrow filters (unrelated to level)
     // nhev4 sets the dividing line between narrow2 and narrow4
     let mask =
-      clamp(mask6(p2, p1, p0, q0, q1, q2, bd - 8), 1, MAX_LOOP_FILTER + 1);
+      clamp(mask6::<BD>(p2, p1, p0, q0, q1, q2), 1, MAX_LOOP_FILTER + 1);
     let flatp = flat6(p2, p1, p0, q0, q1, q2) <= flat;
-    let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1);
+    let nhev = clamp(nhev4::<BD>(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1);
 
     // sse for each; short-circuit the 'special' no-op cases.
     let sse_none = stride_sse(&a, &none);
@@ -627,28 +632,24 @@ fn sse_size6<T: Pixel>(
 }
 
 #[inline]
-fn mask8(
+fn mask8<const BD: usize>(
   p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32,
-  shift: usize,
 ) -> usize {
   cmp::max(
-    limit_to_level(
+    limit_to_level::<BD>(cmp::max(
+      (p3 - p2).abs(),
       cmp::max(
-        (p3 - p2).abs(),
+        (p2 - p1).abs(),
         cmp::max(
-          (p2 - p1).abs(),
+          (p1 - p0).abs(),
           cmp::max(
-            (p1 - p0).abs(),
-            cmp::max(
-              (q3 - q2).abs(),
-              cmp::max((q2 - q1).abs(), (q1 - q0).abs()),
-            ),
+            (q3 - q2).abs(),
+            cmp::max((q2 - q1).abs(), (q1 - q0).abs()),
           ),
         ),
       ),
-      shift,
-    ),
-    blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift),
+    )),
+    blimit_to_level::<BD>((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2),
   ) as usize
 }
 
@@ -669,17 +670,17 @@ fn flat8(
 }
 
 #[inline]
-fn deblock_size8_inner(
-  [p3, p2, p1, p0, q0, q1, q2, q3]: [i32; 8], level: usize, bd: usize,
+fn deblock_size8_inner<const BD: usize>(
+  [p3, p2, p1, p0, q0, q1, q2, q3]: [i32; 8], level: usize,
 ) -> Option<[i32; 6]> {
-  if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level {
-    let flat = 1 << (bd - 8);
+  if mask8::<BD>(p3, p2, p1, p0, q0, q1, q2, q3) <= level {
+    let flat = 1 << (BD - 8);
     let x = if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat {
       filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3)
-    } else if nhev4(p1, p0, q0, q1, bd - 8) <= level {
-      filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8)
+    } else if nhev4::<BD>(p1, p0, q0, q1) <= level {
+      filter_narrow4_6::<BD>(p2, p1, p0, q0, q1, q2)
     } else {
-      filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8)
+      filter_narrow2_6::<BD>(p2, p1, p0, q0, q1, q2)
     };
     Some(x)
   } else {
@@ -688,8 +689,8 @@ fn deblock_size8_inner(
 }
 
 // Assumes rec[0] is set 4 taps back from the edge
-fn deblock_v_size8<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_v_size8<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for y in 0..4 {
     let p = &rec[y];
@@ -703,15 +704,15 @@ fn deblock_v_size8<T: Pixel>(
       p[6].as_(),
       p[7].as_(),
     ];
-    if let Some(data) = deblock_size8_inner(vals, level, bd) {
+    if let Some(data) = deblock_size8_inner::<BD>(vals, level) {
       copy_horizontal(rec, 1, y, &data);
     }
   }
 }
 
 // Assumes rec[0] is set 4 taps back from the edge
-fn deblock_h_size8<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_h_size8<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for x in 0..4 {
     let vals = [
@@ -724,7 +725,7 @@ fn deblock_h_size8<T: Pixel>(
       rec[6][x].as_(),
       rec[7][x].as_(),
     ];
-    if let Some(data) = deblock_size8_inner(vals, level, bd) {
+    if let Some(data) = deblock_size8_inner::<BD>(vals, level) {
       copy_vertical(rec, x, 1, &data);
     }
   }
@@ -732,11 +733,11 @@ fn deblock_h_size8<T: Pixel>(
 
 // Assumes rec[0] and src[0] are set 4 taps back from the edge.
 // Accesses eight taps, accumulates six pixels into the tally
-fn sse_size8<T: Pixel>(
+fn sse_size8<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>,
-  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize,
+  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool,
 ) {
-  let flat = 1 << (bd - 8);
+  let flat = 1 << (BD - 8);
 
   for i in 0..4 {
     let (p3, p2, p1, p0, q0, q1, q2, q3, a) = if horizontal_p {
@@ -786,19 +787,19 @@ fn sse_size8<T: Pixel>(
     // Four possibilities: no filter, wide8, narrow2 and narrow4
     let none: [_; 6] = [p2, p1, p0, q0, q1, q2];
     let wide8: [_; 6] = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3);
-    let narrow2: [_; 6] = filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8);
-    let narrow4: [_; 6] = filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8);
+    let narrow2: [_; 6] = filter_narrow2_6::<BD>(p2, p1, p0, q0, q1, q2);
+    let narrow4: [_; 6] = filter_narrow4_6::<BD>(p2, p1, p0, q0, q1, q2);
 
     // mask8 sets the dividing line for filter vs no filter
     // flat8 decides between wide and narrow filters (unrelated to level)
     // nhev4 sets the dividing line between narrow2 and narrow4
     let mask = clamp(
-      mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8),
+      mask8::<BD>(p3, p2, p1, p0, q0, q1, q2, q3),
       1,
       MAX_LOOP_FILTER + 1,
     );
     let flatp = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat;
-    let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1);
+    let nhev = clamp(nhev4::<BD>(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1);
 
     // sse for each; short-circuit the 'special' no-op cases.
     let sse_none = stride_sse(&a, &none);
@@ -845,13 +846,13 @@ fn flat14_outer(
 }
 
 #[inline]
-fn deblock_size14_inner(
+fn deblock_size14_inner<const BD: usize>(
   [p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6]: [i32; 14],
-  level: usize, bd: usize,
+  level: usize,
 ) -> Option<[i32; 12]> {
   // 'mask' test
-  if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level {
-    let flat = 1 << (bd - 8);
+  if mask8::<BD>(p3, p2, p1, p0, q0, q1, q2, q3) <= level {
+    let flat = 1 << (BD - 8);
     // inner flatness test
     let x = if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat {
       // outer flatness test
@@ -864,11 +865,11 @@ fn deblock_size14_inner(
         // only flat in inner area, run 8-tap
         filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5)
       }
-    } else if nhev4(p1, p0, q0, q1, bd - 8) <= level {
+    } else if nhev4::<BD>(p1, p0, q0, q1) <= level {
       // not flat, run narrow filter
-      filter_narrow4_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8)
+      filter_narrow4_12::<BD>(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5)
     } else {
-      filter_narrow2_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8)
+      filter_narrow2_12::<BD>(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5)
     };
     Some(x)
   } else {
@@ -877,8 +878,8 @@ fn deblock_size14_inner(
 }
 
 // Assumes rec[0] is set 7 taps back from the edge
-fn deblock_v_size14<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_v_size14<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for y in 0..4 {
     let p = &rec[y];
@@ -898,15 +899,15 @@ fn deblock_v_size14<T: Pixel>(
       p[12].as_(),
       p[13].as_(),
     ];
-    if let Some(data) = deblock_size14_inner(vals, level, bd) {
+    if let Some(data) = deblock_size14_inner::<BD>(vals, level) {
       copy_horizontal(rec, 1, y, &data);
     }
   }
 }
 
 // Assumes rec[0] is set 7 taps back from the edge
-fn deblock_h_size14<T: Pixel>(
-  rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize,
+fn deblock_h_size14<T: Pixel, const BD: usize>(
+  rec: &mut PlaneRegionMut<'_, T>, level: usize,
 ) {
   for x in 0..4 {
     let vals = [
@@ -925,7 +926,7 @@ fn deblock_h_size14<T: Pixel>(
       rec[12][x].as_(),
       rec[13][x].as_(),
     ];
-    if let Some(data) = deblock_size14_inner(vals, level, bd) {
+    if let Some(data) = deblock_size14_inner::<BD>(vals, level) {
       copy_vertical(rec, x, 1, &data);
     }
   }
@@ -933,11 +934,11 @@ fn deblock_h_size14<T: Pixel>(
 
 // Assumes rec[0] and src[0] are set 7 taps back from the edge.
 // Accesses fourteen taps, accumulates twelve pixels into the tally
-fn sse_size14<T: Pixel>(
+fn sse_size14<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>,
-  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize,
+  tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool,
 ) {
-  let flat = 1 << (bd - 8);
+  let flat = 1 << (BD - 8);
   for i in 0..4 {
     let (p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, a) =
       if horizontal_p {
@@ -1014,49 +1015,23 @@ fn sse_size14<T: Pixel>(
       filter_wide14_12(p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6);
     let wide8 =
       filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5);
-    let narrow2 = filter_narrow2_12(
-      p5,
-      p4,
-      p3,
-      p2,
-      p1,
-      p0,
-      q0,
-      q1,
-      q2,
-      q3,
-      q4,
-      q5,
-      bd - 8,
-    );
-    let narrow4 = filter_narrow4_12(
-      p5,
-      p4,
-      p3,
-      p2,
-      p1,
-      p0,
-      q0,
-      q1,
-      q2,
-      q3,
-      q4,
-      q5,
-      bd - 8,
-    );
+    let narrow2 =
+      filter_narrow2_12::<BD>(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5);
+    let narrow4 =
+      filter_narrow4_12::<BD>(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5);
 
     // mask8 sets the dividing line for filter vs no filter
     // flat8 decides between wide and narrow filters (unrelated to level)
     // flat14 decides between wide14 and wide8 filters
     // nhev4 sets the dividing line between narrow2 and narrow4
     let mask = clamp(
-      mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8),
+      mask8::<BD>(p3, p2, p1, p0, q0, q1, q2, q3),
       1,
       MAX_LOOP_FILTER + 1,
     );
     let flat8p = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat;
     let flat14p = flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat;
-    let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1);
+    let nhev = clamp(nhev4::<BD>(p1, p0, q0, q1), mask, MAX_LOOP_FILTER + 1);
 
     // sse for each; short-circuit the 'special' no-op cases.
     let sse_none = stride_sse(&a, &none);
@@ -1098,9 +1073,9 @@ fn sse_size14<T: Pixel>(
   }
 }
 
-fn filter_v_edge<T: Pixel>(
+fn filter_v_edge<T: Pixel, const BD: usize>(
   deblock: &DeblockState, blocks: &TileBlocks, bo: TileBlockOffset,
-  p: &mut PlaneRegionMut<T>, pli: usize, bd: usize, xdec: usize, ydec: usize,
+  p: &mut PlaneRegionMut<T>, pli: usize, xdec: usize, ydec: usize,
 ) {
   let block = &blocks[bo];
   let txsize = if pli == 0 {
@@ -1126,16 +1101,16 @@ fn filter_v_edge<T: Pixel>(
         });
         match filter_size {
           4 => {
-            deblock_v_size4(&mut plane_region, level, bd);
+            deblock_v_size4::<_, BD>(&mut plane_region, level);
           }
           6 => {
-            deblock_v_size6(&mut plane_region, level, bd);
+            deblock_v_size6::<_, BD>(&mut plane_region, level);
           }
           8 => {
-            deblock_v_size8(&mut plane_region, level, bd);
+            deblock_v_size8::<_, BD>(&mut plane_region, level);
           }
           14 => {
-            deblock_v_size14(&mut plane_region, level, bd);
+            deblock_v_size14::<_, BD>(&mut plane_region, level);
           }
           _ => unreachable!(),
         }
@@ -1144,10 +1119,10 @@ fn filter_v_edge<T: Pixel>(
   }
 }
 
-fn sse_v_edge<T: Pixel>(
+fn sse_v_edge<T: Pixel, const BD: usize>(
   blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion<T>,
   src_plane: &PlaneRegion<T>, tally: &mut [i64; MAX_LOOP_FILTER + 2],
-  pli: usize, bd: usize, xdec: usize, ydec: usize,
+  pli: usize, xdec: usize, ydec: usize,
 ) {
   let block = &blocks[bo];
   let txsize = if pli == 0 {
@@ -1177,16 +1152,16 @@ fn sse_v_edge<T: Pixel>(
       });
       match filter_size {
         4 => {
-          sse_size4(&rec_region, &src_region, tally, false, bd);
+          sse_size4::<_, BD>(&rec_region, &src_region, tally, false);
         }
         6 => {
-          sse_size6(&rec_region, &src_region, tally, false, bd);
+          sse_size6::<_, BD>(&rec_region, &src_region, tally, false);
         }
         8 => {
-          sse_size8(&rec_region, &src_region, tally, false, bd);
+          sse_size8::<_, BD>(&rec_region, &src_region, tally, false);
         }
         14 => {
-          sse_size14(&rec_region, &src_region, tally, false, bd);
+          sse_size14::<_, BD>(&rec_region, &src_region, tally, false);
         }
         _ => unreachable!(),
       }
@@ -1194,9 +1169,9 @@ fn sse_v_edge<T: Pixel>(
   }
 }
 
-fn filter_h_edge<T: Pixel>(
+fn filter_h_edge<T: Pixel, const BD: usize>(
   deblock: &DeblockState, blocks: &TileBlocks, bo: TileBlockOffset,
-  p: &mut PlaneRegionMut<T>, pli: usize, bd: usize, xdec: usize, ydec: usize,
+  p: &mut PlaneRegionMut<T>, pli: usize, xdec: usize, ydec: usize,
 ) {
   let block = &blocks[bo];
   let txsize = if pli == 0 {
@@ -1222,16 +1197,16 @@ fn filter_h_edge<T: Pixel>(
         });
         match filter_size {
           4 => {
-            deblock_h_size4(&mut plane_region, level, bd);
+            deblock_h_size4::<_, BD>(&mut plane_region, level);
           }
           6 => {
-            deblock_h_size6(&mut plane_region, level, bd);
+            deblock_h_size6::<_, BD>(&mut plane_region, level);
           }
           8 => {
-            deblock_h_size8(&mut plane_region, level, bd);
+            deblock_h_size8::<_, BD>(&mut plane_region, level);
           }
           14 => {
-            deblock_h_size14(&mut plane_region, level, bd);
+            deblock_h_size14::<_, BD>(&mut plane_region, level);
           }
           _ => unreachable!(),
         }
@@ -1240,10 +1215,10 @@ fn filter_h_edge<T: Pixel>(
   }
 }
 
-fn sse_h_edge<T: Pixel>(
+fn sse_h_edge<T: Pixel, const BD: usize>(
   blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion<T>,
   src_plane: &PlaneRegion<T>, tally: &mut [i64; MAX_LOOP_FILTER + 2],
-  pli: usize, bd: usize, xdec: usize, ydec: usize,
+  pli: usize, xdec: usize, ydec: usize,
 ) {
   let block = &blocks[bo];
   let txsize = if pli == 0 {
@@ -1274,16 +1249,16 @@ fn sse_h_edge<T: Pixel>(
 
       match filter_size {
         4 => {
-          sse_size4(&rec_region, &src_region, tally, true, bd);
+          sse_size4::<_, BD>(&rec_region, &src_region, tally, true);
         }
         6 => {
-          sse_size6(&rec_region, &src_region, tally, true, bd);
+          sse_size6::<_, BD>(&rec_region, &src_region, tally, true);
         }
         8 => {
-          sse_size8(&rec_region, &src_region, tally, true, bd);
+          sse_size8::<_, BD>(&rec_region, &src_region, tally, true);
         }
         14 => {
-          sse_size14(&rec_region, &src_region, tally, true, bd);
+          sse_size14::<_, BD>(&rec_region, &src_region, tally, true);
         }
         _ => unreachable!(),
       }
@@ -1293,9 +1268,9 @@ fn sse_h_edge<T: Pixel>(
 
 // Deblocks all edges, vertical and horizontal, in a single plane
 #[hawktracer(deblock_plane)]
-pub fn deblock_plane<T: Pixel>(
+pub fn deblock_plane<T: Pixel, const BD: usize>(
   deblock: &DeblockState, p: &mut PlaneRegionMut<T>, pli: usize,
-  blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize,
+  blocks: &TileBlocks, crop_w: usize, crop_h: usize,
 ) {
   let xdec = p.plane_cfg.xdec;
   let ydec = p.plane_cfg.ydec;
@@ -1339,26 +1314,24 @@ pub fn deblock_plane<T: Pixel>(
   // edge).  Unroll to avoid corner-cases.
   if rows > 0 {
     for x in (1 << xdec..cols).step_by(1 << xdec) {
-      filter_v_edge(
+      filter_v_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset { x, y: 0 }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
     }
     if rows > 1 << ydec {
       for x in (1 << xdec..cols).step_by(1 << xdec) {
-        filter_v_edge(
+        filter_v_edge::<_, BD>(
           deblock,
           blocks,
           TileBlockOffset(BlockOffset { x, y: 1 << ydec }),
           p,
           pli,
-          bd,
           xdec,
           ydec,
         );
@@ -1371,13 +1344,12 @@ pub fn deblock_plane<T: Pixel>(
   for y in ((2 << ydec)..rows).step_by(1 << ydec) {
     // Check for vertical edge at first MI block boundary on this row
     if cols > 1 << xdec {
-      filter_v_edge(
+      filter_v_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset { x: 1 << xdec, y }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
@@ -1385,17 +1357,16 @@ pub fn deblock_plane<T: Pixel>(
     // run the rest of the row with both vertical and horizontal edge filtering.
     // Horizontal lags vertical edge by one row and two columns.
     for x in (2 << xdec..cols).step_by(1 << xdec) {
-      filter_v_edge(
+      filter_v_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset { x, y }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
-      filter_h_edge(
+      filter_h_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset {
@@ -1404,14 +1375,13 @@ pub fn deblock_plane<T: Pixel>(
         }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
     }
     // ..and the last two horizontal edges for the row
     if cols >= 2 << xdec {
-      filter_h_edge(
+      filter_h_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset {
@@ -1420,13 +1390,12 @@ pub fn deblock_plane<T: Pixel>(
         }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
     }
     if cols >= 1 << xdec {
-      filter_h_edge(
+      filter_h_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset {
@@ -1435,7 +1404,6 @@ pub fn deblock_plane<T: Pixel>(
         }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
@@ -1445,13 +1413,12 @@ pub fn deblock_plane<T: Pixel>(
   // Last horizontal row, vertical is already complete
   if rows > 1 << ydec {
     for x in (0..cols).step_by(1 << xdec) {
-      filter_h_edge(
+      filter_h_edge::<_, BD>(
         deblock,
         blocks,
         TileBlockOffset(BlockOffset { x, y: rows - (1 << ydec) }),
         p,
         pli,
-        bd,
         xdec,
         ydec,
       );
@@ -1460,11 +1427,11 @@ pub fn deblock_plane<T: Pixel>(
 }
 
 // sse count of all edges in a single plane, accumulates into vertical and horizontal counts
-fn sse_plane<T: Pixel>(
+fn sse_plane<T: Pixel, const BD: usize>(
   rec: &PlaneRegion<T>, src: &PlaneRegion<T>,
   v_sse: &mut [i64; MAX_LOOP_FILTER + 2],
   h_sse: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, blocks: &TileBlocks,
-  crop_w: usize, crop_h: usize, bd: usize,
+  crop_w: usize, crop_h: usize,
 ) {
   let xdec = rec.plane_cfg.xdec;
   let ydec = rec.plane_cfg.ydec;
@@ -1485,14 +1452,13 @@ fn sse_plane<T: Pixel>(
 
   // No horizontal edge filtering along top of frame
   for x in (1 << xdec..cols).step_by(1 << xdec) {
-    sse_v_edge(
+    sse_v_edge::<_, BD>(
       blocks,
       TileBlockOffset(BlockOffset { x, y: 0 }),
       rec,
       src,
       v_sse,
       pli,
-      bd,
       xdec,
       ydec,
     );
@@ -1503,37 +1469,34 @@ fn sse_plane<T: Pixel>(
   // behind vertical.
   for y in (1 << ydec..rows).step_by(1 << ydec) {
     // No vertical filtering along left edge of frame
-    sse_h_edge(
+    sse_h_edge::<_, BD>(
       blocks,
       TileBlockOffset(BlockOffset { x: 0, y }),
       rec,
       src,
       h_sse,
       pli,
-      bd,
       xdec,
       ydec,
     );
     for x in (1 << xdec..cols).step_by(1 << xdec) {
-      sse_v_edge(
+      sse_v_edge::<_, BD>(
         blocks,
         TileBlockOffset(BlockOffset { x, y }),
         rec,
         src,
         v_sse,
         pli,
-        bd,
         xdec,
         ydec,
       );
-      sse_h_edge(
+      sse_h_edge::<_, BD>(
         blocks,
         TileBlockOffset(BlockOffset { x, y }),
         rec,
         src,
         h_sse,
         pli,
-        bd,
         xdec,
         ydec,
       );
@@ -1543,18 +1506,18 @@ fn sse_plane<T: Pixel>(
 
 // Deblocks all edges in all planes of a frame
 #[hawktracer(deblock_filter_frame)]
-pub fn deblock_filter_frame<T: Pixel>(
+pub fn deblock_filter_frame<T: Pixel, const BD: usize>(
   deblock: &DeblockState, tile: &mut TileMut<T>, blocks: &TileBlocks,
-  crop_w: usize, crop_h: usize, bd: usize, planes: usize,
+  crop_w: usize, crop_h: usize, planes: usize,
 ) {
   tile.planes[..planes].par_iter_mut().enumerate().for_each(|(pli, plane)| {
-    deblock_plane(deblock, plane, pli, blocks, crop_w, crop_h, bd);
+    deblock_plane::<_, BD>(deblock, plane, pli, blocks, crop_w, crop_h);
   });
 }
 
-fn sse_optimize<T: Pixel>(
+fn sse_optimize<T: Pixel, const BD: usize>(
   rec: &Tile<T>, input: &Tile<T>, blocks: &TileBlocks, crop_w: usize,
-  crop_h: usize, bd: usize, monochrome: bool,
+  crop_h: usize, monochrome: bool,
 ) -> [u8; 4] {
   // i64 allows us to accumulate a total of ~ 35 bits worth of pixels
   assert!(
@@ -1569,7 +1532,7 @@ fn sse_optimize<T: Pixel>(
     let mut v_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2];
     let mut h_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2];
 
-    sse_plane(
+    sse_plane::<_, BD>(
       &rec.planes[pli],
       &input.planes[pli],
       &mut v_tally,
@@ -1578,7 +1541,6 @@ fn sse_optimize<T: Pixel>(
       blocks,
       crop_w,
       crop_h,
-      bd,
     );
 
     for i in 1..=MAX_LOOP_FILTER {
@@ -1619,14 +1581,14 @@ fn sse_optimize<T: Pixel>(
 }
 
 #[hawktracer(deblock_filter_optimize)]
-pub fn deblock_filter_optimize<T: Pixel, U: Pixel>(
+pub fn deblock_filter_optimize<T: Pixel, U: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, rec: &Tile<U>, input: &Tile<U>,
   blocks: &TileBlocks, crop_w: usize, crop_h: usize,
 ) -> [u8; 4] {
   if fi.config.speed_settings.fast_deblock {
-    let q = ac_q(fi.base_q_idx, 0, fi.sequence.bit_depth).get() as i32;
+    let q = ac_q::<BD>(fi.base_q_idx, 0).get() as i32;
     let level = clamp(
-      match fi.sequence.bit_depth {
+      match BD {
         8 => {
           if fi.frame_type == FrameType::KEY {
             (q * 17563 - 421_574 + (1 << 18 >> 1)) >> 18
@@ -1657,13 +1619,12 @@ pub fn deblock_filter_optimize<T: Pixel, U: Pixel>(
   } else {
     // Deblocking happens in 4x4 (luma) units; luma x,y are clipped to
     // the *crop frame* of the entire frame by 4x4 block.
-    sse_optimize(
+    sse_optimize::<_, BD>(
       rec,
       input,
       blocks,
       crop_w,
       crop_h,
-      fi.sequence.bit_depth,
       fi.sequence.chroma_sampling == Cs400,
     )
   }
diff --git a/src/dist.rs b/src/dist.rs
index 4b5536a841..aaf2e3e289 100644
--- a/src/dist.rs
+++ b/src/dist.rs
@@ -32,7 +32,7 @@ pub(crate) mod rust {
   /// w and h can be at most 128, the size of the largest block.
   pub fn get_sad<T: Pixel>(
     plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
-    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
+    h: usize, _cpu: CpuFeatureLevel,
   ) -> u32 {
     debug_assert!(w <= 128 && h <= 128);
     let plane_org =
@@ -157,7 +157,7 @@ pub(crate) mod rust {
   /// 4x4 transforms instead of 8x8 transforms when width or height < 8.
   pub fn get_satd<T: Pixel>(
     plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
-    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
+    h: usize, _cpu: CpuFeatureLevel,
   ) -> u32 {
     assert!(w <= 128 && h <= 128);
     assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
@@ -186,9 +186,8 @@ pub(crate) mod rust {
 
         // Revert to sad on edge blocks (frame edges)
         if chunk_w != size || chunk_h != size {
-          sum += get_sad(
-            &chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu,
-          ) as u64;
+          sum +=
+            get_sad(&chunk_org, &chunk_ref, chunk_w, chunk_h, _cpu) as u64;
           continue;
         }
 
@@ -235,8 +234,7 @@ pub(crate) mod rust {
   #[inline(never)]
   pub fn get_weighted_sse<T: Pixel>(
     src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
-    scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
-    _cpu: CpuFeatureLevel,
+    scale_stride: usize, w: usize, h: usize, _cpu: CpuFeatureLevel,
   ) -> u64 {
     let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
     // Always chunk and apply scaling on the sse of squares the size of
@@ -301,9 +299,9 @@ pub(crate) mod rust {
   /// Computes a distortion metric of the sum of squares weighted by activity.
   /// w and h should be <= 8.
   #[inline(never)]
-  pub fn cdef_dist_kernel<T: Pixel>(
+  pub fn cdef_dist_kernel<T: Pixel, const BD: usize>(
     src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
-    bit_depth: usize, _cpu: CpuFeatureLevel,
+    _cpu: CpuFeatureLevel,
   ) -> u32 {
     // TODO: Investigate using different constants in ssim boost for block sizes
     // smaller than 8x8.
@@ -370,7 +368,7 @@ pub(crate) mod rust {
     dvar =
       ((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;
 
-    apply_ssim_boost(sse, svar, dvar, bit_depth)
+    apply_ssim_boost::<BD>(sse, svar, dvar)
   }
 }
 
@@ -442,7 +440,6 @@ pub mod test {
       (64, 16, 116384),
     ];
 
-    let bit_depth: usize = 8;
     let (input_plane, rec_plane) = setup_planes::<T>();
 
     for (w, h, distortion) in blocks {
@@ -453,14 +450,7 @@ pub mod test {
 
       assert_eq!(
         distortion,
-        get_sad(
-          &input_region,
-          &rec_region,
-          w,
-          h,
-          bit_depth,
-          CpuFeatureLevel::default()
-        )
+        get_sad(&input_region, &rec_region, w, h, CpuFeatureLevel::default())
       );
     }
   }
@@ -475,7 +465,7 @@ pub mod test {
     get_sad_same_inner::<u16>();
   }
 
-  fn get_satd_same_inner<T: Pixel>() {
+  fn get_satd_same_inner<T: Pixel, const BD: usize>() {
     let blocks: Vec<(usize, usize, u32)> = vec![
       (4, 4, 1408),
       (4, 8, 2016),
@@ -501,7 +491,6 @@ pub mod test {
       (64, 16, 21312),
     ];
 
-    let bit_depth: usize = 8;
     let (input_plane, rec_plane) = setup_planes::<T>();
 
     for (w, h, distortion) in blocks {
@@ -512,12 +501,11 @@ pub mod test {
 
       assert_eq!(
         distortion,
-        get_satd(
+        get_satd::<_, BD>(
           &input_region,
           &rec_region,
           w,
           h,
-          bit_depth,
           CpuFeatureLevel::default()
         )
       );
@@ -526,11 +514,11 @@ pub mod test {
 
   #[test]
   fn get_satd_same_u8() {
-    get_satd_same_inner::<u8>();
+    get_satd_same_inner::<u8, 8>();
   }
 
   #[test]
   fn get_satd_same_u16() {
-    get_satd_same_inner::<u16>();
+    get_satd_same_inner::<u16, 10>();
   }
 }
diff --git a/src/encoder.rs b/src/encoder.rs
index 2b8d2ee80e..c2b45ee37f 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -570,12 +570,12 @@ pub struct SegmentationState {
 }
 
 impl SegmentationState {
-  pub fn update_threshold(&mut self, base_q_idx: u8, bd: usize) {
-    let base_ac_q = ac_q(base_q_idx, 0, bd).get() as u64;
+  pub fn update_threshold<const BD: usize>(&mut self, base_q_idx: u8) {
+    let base_ac_q = ac_q::<BD>(base_q_idx, 0).get() as u64;
     let real_ac_q = ArrayVec::<_, MAX_SEGMENTS>::from_iter(
       self.data[..=self.max_segment as usize].iter().map(|data| {
-        ac_q(base_q_idx, data[SegLvl::SEG_LVL_ALT_Q as usize] as i8, bd).get()
-          as u64
+        ac_q::<BD>(base_q_idx, data[SegLvl::SEG_LVL_ALT_Q as usize] as i8)
+          .get() as u64
       }),
     );
     self.threshold.fill(DistortionScale(0));
@@ -1246,15 +1246,16 @@ impl<T: Pixel> FrameInvariants<T> {
       (uv_f1 * CDEF_SEC_STRENGTHS as i32 + uv_f2) as u8;
   }
 
-  pub fn set_quantizers(&mut self, qps: &QuantizerParameters) {
+  pub fn set_quantizers<const BD: usize>(
+    &mut self, qps: &QuantizerParameters,
+  ) {
     self.base_q_idx = qps.ac_qi[0];
     let base_q_idx = self.base_q_idx as i32;
     for pi in 0..3 {
       self.dc_delta_q[pi] = (qps.dc_qi[pi] as i32 - base_q_idx) as i8;
       self.ac_delta_q[pi] = (qps.ac_qi[pi] as i32 - base_q_idx) as i8;
     }
-    self.lambda =
-      qps.lambda * ((1 << (2 * (self.sequence.bit_depth - 8))) as f64);
+    self.lambda = qps.lambda * ((1 << (2 * (BD - 8))) as f64);
     self.me_lambda = self.lambda.sqrt();
     self.dist_scale = qps.dist_scale.map(DistortionScale::from);
 
@@ -1394,7 +1395,7 @@ fn get_qidx<T: Pixel>(
 ///
 /// - If the block size is invalid for subsampling
 /// - If a tx type other than DCT is used for 64x64 blocks
-pub fn encode_tx_block<T: Pixel, W: Writer>(
+pub fn encode_tx_block<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>,
   ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter,
@@ -1465,8 +1466,7 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
   let rec = &mut ts.rec.planes[p];
 
   if mode.is_intra() {
-    let bit_depth = fi.sequence.bit_depth;
-    let edge_buf = get_intra_edges(
+    let edge_buf = get_intra_edges::<_, BD>(
       &rec.as_const(),
       tile_partition_bo,
       bx,
@@ -1474,17 +1474,15 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
       bsize,
       po,
       tx_size,
-      bit_depth,
       Some(mode),
       fi.sequence.enable_intra_edge_filter,
       pred_intra_param,
     );
 
-    mode.predict_intra(
+    mode.predict_intra::<_, BD>(
       tile_rect,
       &mut rec.subregion_mut(area),
       tx_size,
-      bit_depth,
       ac,
       pred_intra_param,
       ief_params,
@@ -1536,13 +1534,12 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
     residual.fill(0);
   }
 
-  forward_transform(
+  forward_transform::<_, BD>(
     residual,
     coeffs,
     tx_size.width(),
     tx_size,
     tx_type,
-    fi.sequence.bit_depth,
     fi.cpu_feature_level,
   );
 
@@ -1579,13 +1576,12 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
   };
 
   // Reconstruct
-  dequantize(
+  dequantize::<_, BD>(
     qidx,
     qcoeffs,
     eob,
     rcoeffs,
     tx_size,
-    fi.sequence.bit_depth,
     fi.dc_delta_q[p],
     fi.ac_delta_q[p],
     fi.cpu_feature_level,
@@ -1594,13 +1590,12 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
   if eob == 0 {
     // All zero coefficients is a no-op
   } else if !fi.use_tx_domain_distortion || need_recon_pixel {
-    inverse_transform_add(
+    inverse_transform_add::<_, BD>(
       rcoeffs,
       &mut rec.subregion_mut(area),
       eob,
       tx_size,
       tx_type,
-      fi.sequence.bit_depth,
       fi.cpu_feature_level,
     );
   }
@@ -1654,7 +1649,7 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
 /// # Panics
 ///
 /// - If the block size is invalid for subsampling
-pub fn motion_compensate<T: Pixel>(
+pub fn motion_compensate<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, luma_mode: PredictionMode, ref_frames: [RefType; 2],
   mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset,
@@ -1713,7 +1708,7 @@ pub fn motion_compensate<T: Pixel>(
       };
 
       if some_use_intra {
-        luma_mode.predict_inter(
+        luma_mode.predict_inter::<_, BD>(
           fi,
           tile_rect,
           p,
@@ -1741,7 +1736,7 @@ pub fn motion_compensate<T: Pixel>(
           let area2 = Area::StartingAt { x: po2.x, y: po2.y };
           let po3 = PlaneOffset { x: po.x + 2, y: po.y + 2 };
           let area3 = Area::StartingAt { x: po3.x, y: po3.y };
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1753,7 +1748,7 @@ pub fn motion_compensate<T: Pixel>(
             mv0,
             compound_buffer,
           );
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1765,7 +1760,7 @@ pub fn motion_compensate<T: Pixel>(
             mv1,
             compound_buffer,
           );
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1777,7 +1772,7 @@ pub fn motion_compensate<T: Pixel>(
             mv2,
             compound_buffer,
           );
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1793,7 +1788,7 @@ pub fn motion_compensate<T: Pixel>(
         if bsize == BlockSize::BLOCK_8X4 {
           let mv1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].mv;
           let rf1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].ref_frames;
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1807,7 +1802,7 @@ pub fn motion_compensate<T: Pixel>(
           );
           let po3 = PlaneOffset { x: po.x, y: po.y + 2 };
           let area3 = Area::StartingAt { x: po3.x, y: po3.y };
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1823,7 +1818,7 @@ pub fn motion_compensate<T: Pixel>(
         if bsize == BlockSize::BLOCK_4X8 {
           let mv2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].mv;
           let rf2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].ref_frames;
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1837,7 +1832,7 @@ pub fn motion_compensate<T: Pixel>(
           );
           let po3 = PlaneOffset { x: po.x + 2, y: po.y };
           let area3 = Area::StartingAt { x: po3.x, y: po3.y };
-          luma_mode.predict_inter(
+          luma_mode.predict_inter::<_, BD>(
             fi,
             tile_rect,
             p,
@@ -1852,7 +1847,7 @@ pub fn motion_compensate<T: Pixel>(
         }
       }
     } else {
-      luma_mode.predict_inter(
+      luma_mode.predict_inter::<_, BD>(
         fi,
         tile_rect,
         p,
@@ -1922,7 +1917,7 @@ pub fn encode_block_pre_cdef<T: Pixel, W: Writer>(
 ///
 /// - If chroma and luma do not match for inter modes
 /// - If an invalid motion vector is found
-pub fn encode_block_post_cdef<T: Pixel, W: Writer>(
+pub fn encode_block_post_cdef<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode,
   chroma_mode: PredictionMode, angle_delta: AngleDelta,
@@ -2184,10 +2179,10 @@ pub fn encode_block_post_cdef<T: Pixel, W: Writer>(
   }
 
   if is_inter {
-    motion_compensate(
+    motion_compensate::<_, BD>(
       fi, ts, cw, luma_mode, ref_frames, mvs, bsize, tile_bo, false,
     );
-    write_tx_tree(
+    write_tx_tree::<_, _, BD>(
       fi,
       ts,
       cw,
@@ -2204,7 +2199,7 @@ pub fn encode_block_post_cdef<T: Pixel, W: Writer>(
       need_recon_pixel,
     )
   } else {
-    write_tx_blocks(
+    write_tx_blocks::<_, _, BD>(
       fi,
       ts,
       cw,
@@ -2228,7 +2223,7 @@ pub fn encode_block_post_cdef<T: Pixel, W: Writer>(
 /// # Panics
 ///
 /// - If attempting to encode a lossless block (not yet supported)
-pub fn write_tx_blocks<T: Pixel, W: Writer>(
+pub fn write_tx_blocks<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode,
   chroma_mode: PredictionMode, angle_delta: AngleDelta,
@@ -2249,14 +2244,7 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
   let do_chroma =
     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
 
-  ts.qc.update(
-    qidx,
-    tx_size,
-    luma_mode.is_intra(),
-    fi.sequence.bit_depth,
-    fi.dc_delta_q[0],
-    0,
-  );
+  ts.qc.update::<BD>(qidx, tx_size, luma_mode.is_intra(), fi.dc_delta_q[0], 0);
 
   for by in 0..bh {
     for bx in 0..bw {
@@ -2268,7 +2256,7 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
         continue;
       }
       let po = tx_bo.plane_offset(&ts.input.planes[0].cfg);
-      let (has_coeff, dist) = encode_tx_block(
+      let (has_coeff, dist) = encode_tx_block::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -2333,11 +2321,10 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
   };
 
   for p in 1..3 {
-    ts.qc.update(
+    ts.qc.update::<BD>(
       qidx,
       uv_tx_size,
       true,
-      fi.sequence.bit_depth,
       fi.dc_delta_q[p],
       fi.ac_delta_q[p],
     );
@@ -2354,7 +2341,7 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
         let mut po = tile_bo.plane_offset(&ts.input.planes[p].cfg);
         po.x += (bx * uv_tx_size.width()) as isize;
         po.y += (by * uv_tx_size.height()) as isize;
-        let (has_coeff, dist) = encode_tx_block(
+        let (has_coeff, dist) = encode_tx_block::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2389,7 +2376,7 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
   (partition_has_coeff, tx_dist)
 }
 
-pub fn write_tx_tree<T: Pixel, W: Writer>(
+pub fn write_tx_tree<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode,
   angle_delta_y: i8, tile_bo: TileBlockOffset, bsize: BlockSize,
@@ -2408,14 +2395,7 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
   let mut partition_has_coeff: bool = false;
   let mut tx_dist = ScaledDistortion::zero();
 
-  ts.qc.update(
-    qidx,
-    tx_size,
-    luma_mode.is_intra(),
-    fi.sequence.bit_depth,
-    fi.dc_delta_q[0],
-    0,
-  );
+  ts.qc.update::<BD>(qidx, tx_size, luma_mode.is_intra(), fi.dc_delta_q[0], 0);
 
   // TODO: If tx-parition more than only 1-level, this code does not work.
   // It should recursively traverse the tx block that are split recursivelty by calling write_tx_tree(),
@@ -2431,7 +2411,7 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
       }
 
       let po = tx_bo.plane_offset(&ts.input.planes[0].cfg);
-      let (has_coeff, dist) = encode_tx_block(
+      let (has_coeff, dist) = encode_tx_block::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -2494,11 +2474,10 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
   };
 
   for p in 1..3 {
-    ts.qc.update(
+    ts.qc.update::<BD>(
       qidx,
       uv_tx_size,
       false,
-      fi.sequence.bit_depth,
       fi.dc_delta_q[p],
       fi.ac_delta_q[p],
     );
@@ -2515,7 +2494,7 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
         let mut po = tile_bo.plane_offset(&ts.input.planes[p].cfg);
         po.x += (bx * uv_tx_size.width()) as isize;
         po.y += (by * uv_tx_size.height()) as isize;
-        let (has_coeff, dist) = encode_tx_block(
+        let (has_coeff, dist) = encode_tx_block::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2546,7 +2525,7 @@ pub fn write_tx_tree<T: Pixel, W: Writer>(
   (partition_has_coeff, tx_dist)
 }
 
-pub fn encode_block_with_modes<T: Pixel, W: Writer>(
+pub fn encode_block_with_modes<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset,
@@ -2572,7 +2551,7 @@ pub fn encode_block_with_modes<T: Pixel, W: Writer>(
 
   let (tx_size, tx_type) = if !mode_decision.skip && !mode_decision.has_coeff {
     skip = true;
-    rdo_tx_size_type(
+    rdo_tx_size_type::<_, BD>(
       fi, ts, cw, bsize, tile_bo, mode_luma, ref_frames, mvs, skip,
     )
   } else {
@@ -2588,7 +2567,7 @@ pub fn encode_block_with_modes<T: Pixel, W: Writer>(
     tile_bo,
     skip,
   );
-  encode_block_post_cdef(
+  encode_block_post_cdef::<_, _, BD>(
     fi,
     ts,
     cw,
@@ -2612,7 +2591,7 @@ pub fn encode_block_with_modes<T: Pixel, W: Writer>(
   );
 }
 
-fn encode_partition_bottomup<T: Pixel, W: Writer>(
+fn encode_partition_bottomup<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset, ref_rd_cost: f64,
@@ -2673,7 +2652,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
     };
 
     let mode_decision =
-      rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
+      rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg);
 
     if !mode_decision.pred_mode_luma.is_intra() {
       // Fill the saved motion structure
@@ -2693,7 +2672,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
     rdo_output.part_modes.push(mode_decision.clone());
 
     if !can_split {
-      encode_block_with_modes(
+      encode_block_with_modes::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -2783,7 +2762,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
         if offset.0.x >= ts.mi_width || offset.0.y >= ts.mi_height {
           continue;
         }
-        let child_rdo_output = encode_partition_bottomup(
+        let child_rdo_output = encode_partition_bottomup::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2856,7 +2835,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
         }
 
         // FIXME: redundant block re-encode
-        encode_block_with_modes(
+        encode_block_with_modes::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2895,7 +2874,7 @@ fn encode_partition_bottomup<T: Pixel, W: Writer>(
   rdo_output
 }
 
-fn encode_partition_topdown<T: Pixel, W: Writer>(
+fn encode_partition_topdown<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset,
@@ -2939,7 +2918,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
     debug_assert!(bsize.is_sqr());
 
     // Blocks of sizes within the supported range are subjected to a partitioning decision
-    rdo_output = rdo_partition_decision(
+    rdo_output = rdo_partition_decision::<_, _, BD>(
       fi,
       ts,
       cw,
@@ -2977,7 +2956,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
       } else {
         // Make a prediction mode decision for blocks encoded with no rdo_partition_decision call (e.g. edges)
         rdo_decision =
-          rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
+          rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg);
         &rdo_decision
       };
 
@@ -2997,7 +2976,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
       // NOTE: Cannot avoid calling rdo_tx_size_type() here again,
       // because, with top-down partition RDO, the neighboring contexts
       // of current partition can change, i.e. neighboring partitions can split down more.
-      let (tx_size, tx_type) = rdo_tx_size_type(
+      let (tx_size, tx_type) = rdo_tx_size_type::<_, BD>(
         fi, ts, cw, bsize, tile_bo, mode_luma, ref_frames, mvs, skip,
       );
 
@@ -3105,7 +3084,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
         tile_bo,
         skip,
       );
-      encode_block_post_cdef(
+      encode_block_post_cdef::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -3135,7 +3114,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
         // The optimal prediction modes for each split block is known from an rdo_partition_decision() call
         for mode in rdo_output.part_modes {
           // Each block is subjected to a new splitting decision
-          encode_partition_topdown(
+          encode_partition_topdown::<_, _, BD>(
             fi,
             ts,
             cw,
@@ -3174,7 +3153,7 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
         let partitions = get_sub_partitions(&four_partitions, partition);
 
         partitions.iter().for_each(|&offset| {
-          encode_partition_topdown(
+          encode_partition_topdown::<_, _, BD>(
             fi,
             ts,
             cw,
@@ -3215,7 +3194,7 @@ fn get_initial_cdfcontext<T: Pixel>(fi: &FrameInvariants<T>) -> CDFContext {
 }
 
 #[hawktracer(encode_tile_group)]
-fn encode_tile_group<T: Pixel>(
+fn encode_tile_group<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, fs: &mut FrameState<T>, inter_cfg: &InterConfig,
 ) -> Vec<u8> {
   let planes =
@@ -3233,7 +3212,7 @@ fn encode_tile_group<T: Pixel>(
     .collect::<Vec<_>>()
     .into_par_iter()
     .map(|(mut ctx, cdf)| {
-      encode_tile(fi, &mut ctx.ts, cdf, &mut ctx.tb, inter_cfg)
+      encode_tile::<_, BD>(fi, &mut ctx.ts, cdf, &mut ctx.tb, inter_cfg)
     })
     .unzip();
 
@@ -3247,7 +3226,7 @@ fn encode_tile_group<T: Pixel>(
   /* TODO: Don't apply if lossless */
   let levels = fs.apply_tile_state_mut(|ts| {
     let rec = &mut ts.rec;
-    deblock_filter_optimize(
+    deblock_filter_optimize::<_, _, BD>(
       fi,
       &rec.as_const(),
       &ts.input.as_tile(),
@@ -3261,13 +3240,12 @@ fn encode_tile_group<T: Pixel>(
   if fs.deblock.levels[0] != 0 || fs.deblock.levels[1] != 0 {
     fs.apply_tile_state_mut(|ts| {
       let rec = &mut ts.rec;
-      deblock_filter_frame(
+      deblock_filter_frame::<_, BD>(
         ts.deblock,
         rec,
         &blocks.as_tile_blocks(),
         fi.width,
         fi.height,
-        fi.sequence.bit_depth,
         planes,
       );
     });
@@ -3282,11 +3260,16 @@ fn encode_tile_group<T: Pixel>(
     if fi.sequence.enable_cdef {
       fs.apply_tile_state_mut(|ts| {
         let rec = &mut ts.rec;
-        cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec);
+        cdef_filter_tile::<_, BD>(
+          fi,
+          &deblocked_frame,
+          &blocks.as_tile_blocks(),
+          rec,
+        );
       });
     }
     /* TODO: Don't apply if lossless */
-    fs.restoration.lrf_filter_frame(
+    fs.restoration.lrf_filter_frame::<_, BD>(
       Arc::get_mut(&mut fs.rec).unwrap(),
       &deblocked_frame,
       fi,
@@ -3297,7 +3280,12 @@ fn encode_tile_group<T: Pixel>(
       let deblocked_frame = (*fs.rec).clone();
       fs.apply_tile_state_mut(|ts| {
         let rec = &mut ts.rec;
-        cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec);
+        cdef_filter_tile::<_, BD>(
+          fi,
+          &deblocked_frame,
+          &blocks.as_tile_blocks(),
+          rec,
+        );
       });
     }
   }
@@ -3353,7 +3341,7 @@ pub struct SBSQueueEntry {
   pub w_post_cdef: WriterBase<WriterRecorder>,
 }
 
-fn check_lf_queue<T: Pixel>(
+fn check_lf_queue<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w: &mut WriterBase<WriterEncoder>,
   sbs_q: &mut VecDeque<SBSQueueEntry>, last_lru_ready: &mut [i32; 3],
@@ -3405,7 +3393,7 @@ fn check_lf_queue<T: Pixel>(
             }
           }
           if !already_rdoed {
-            rdo_loop_decision(qe.sbo, fi, ts, cw, w, deblock_p);
+            rdo_loop_decision::<_, _, BD>(qe.sbo, fi, ts, cw, w, deblock_p);
             for pli in 0..planes {
               if qe.lru_index[pli] != -1
                 && last_lru_rdoed[pli] < qe.lru_index[pli]
@@ -3445,7 +3433,7 @@ fn check_lf_queue<T: Pixel>(
 }
 
 #[hawktracer(encode_tile)]
-fn encode_tile<'a, T: Pixel>(
+fn encode_tile<'a, T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &'a mut TileStateMut<'_, T>,
   fc: &'a mut CDFContext, blocks: &'a mut TileBlocksMut<'a>,
   inter_cfg: &InterConfig,
@@ -3492,7 +3480,7 @@ fn encode_tile<'a, T: Pixel>(
         || is_straddle_sbx
         || is_straddle_sby
       {
-        encode_partition_bottomup(
+        encode_partition_bottomup::<_, _, BD>(
           fi,
           ts,
           &mut cw,
@@ -3505,7 +3493,7 @@ fn encode_tile<'a, T: Pixel>(
           &mut enc_stats,
         );
       } else {
-        encode_partition_topdown(
+        encode_partition_topdown::<_, _, BD>(
           fi,
           ts,
           &mut cw,
@@ -3547,7 +3535,7 @@ fn encode_tile<'a, T: Pixel>(
         sbs_q.push_back(sbs_qe);
 
         if check_queue && !fi.sequence.enable_delayed_loopfilter_rdo {
-          check_lf_queue(
+          check_lf_queue::<_, BD>(
             fi,
             ts,
             &mut cw,
@@ -3566,7 +3554,7 @@ fn encode_tile<'a, T: Pixel>(
   if fi.sequence.enable_delayed_loopfilter_rdo {
     // Solve deblocking for just this tile
     /* TODO: Don't apply if lossless */
-    let deblock_levels = deblock_filter_optimize(
+    let deblock_levels = deblock_filter_optimize::<_, _, BD>(
       fi,
       &ts.rec.as_const(),
       &ts.input_tile,
@@ -3592,18 +3580,17 @@ fn encode_tile<'a, T: Pixel>(
       deblock_copy.levels = deblock_levels;
 
       // temporarily deblock the reference
-      deblock_filter_frame(
+      deblock_filter_frame::<_, BD>(
         &deblock_copy,
         &mut ts.rec,
         &cw.bc.blocks.as_const(),
         fi.width,
         fi.height,
-        fi.sequence.bit_depth,
         planes,
       );
 
       // rdo lf and write
-      check_lf_queue(
+      check_lf_queue::<_, BD>(
         fi,
         ts,
         &mut cw,
@@ -3627,7 +3614,7 @@ fn encode_tile<'a, T: Pixel>(
       }
     } else {
       // rdo lf and write
-      check_lf_queue(
+      check_lf_queue::<_, BD>(
         fi,
         ts,
         &mut cw,
@@ -3743,7 +3730,7 @@ fn get_initial_segmentation<T: Pixel>(
 /// # Panics
 ///
 /// - If the frame packets cannot be written
-pub fn encode_frame<T: Pixel>(
+pub fn encode_frame<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, fs: &mut FrameState<T>, inter_cfg: &InterConfig,
 ) -> Vec<u8> {
   debug_assert!(!fi.is_show_existing_frame());
@@ -3753,9 +3740,9 @@ pub fn encode_frame<T: Pixel>(
 
   if fi.enable_segmentation {
     fs.segmentation = get_initial_segmentation(fi);
-    segmentation_optimize(fi, fs);
+    segmentation_optimize::<_, BD>(fi, fs);
   }
-  let tile_group = encode_tile_group(fi, fs, inter_cfg);
+  let tile_group = encode_tile_group::<_, BD>(fi, fs, inter_cfg);
 
   if fi.frame_type == FrameType::KEY {
     write_key_frame_obus(&mut packet, fi, obu_extension).unwrap();
diff --git a/src/lrf.rs b/src/lrf.rs
index f33a48826e..c793310498 100644
--- a/src/lrf.rs
+++ b/src/lrf.rs
@@ -626,7 +626,7 @@ pub fn setup_integral_image<T: Pixel>(
   }
 }
 
-pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
+pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel, const BD: usize>(
   set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>,
   integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize,
   cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>,
@@ -647,19 +647,6 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
   let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
   let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
 
-  let fn_ab_r1 = match fi.sequence.bit_depth {
-    8 => sgrproj_box_ab_r1::<8>,
-    10 => sgrproj_box_ab_r1::<10>,
-    12 => sgrproj_box_ab_r1::<12>,
-    _ => unimplemented!(),
-  };
-  let fn_ab_r2 = match fi.sequence.bit_depth {
-    8 => sgrproj_box_ab_r2::<8>,
-    10 => sgrproj_box_ab_r2::<10>,
-    12 => sgrproj_box_ab_r2::<12>,
-    _ => unimplemented!(),
-  };
-
   /* prime the intermediate arrays */
   // One oddness about the radius=2 intermediate array computations that
   // the spec doesn't make clear: Although the spec defines computation
@@ -668,7 +655,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
   let integral_image = &integral_image_buffer.integral_image;
   let sq_integral_image = &integral_image_buffer.sq_integral_image;
   if s_r2 > 0 {
-    fn_ab_r2(
+    sgrproj_box_ab_r2::<BD>(
       &mut a_r2[0],
       &mut b_r2[0],
       integral_image,
@@ -682,7 +669,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
   }
   if s_r1 > 0 {
     let integral_image_offset = integral_image_stride + 1;
-    fn_ab_r1(
+    sgrproj_box_ab_r1::<BD>(
       &mut a_r1[0],
       &mut b_r1[0],
       &integral_image[integral_image_offset..],
@@ -693,7 +680,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
       s_r1,
       fi.cpu_feature_level,
     );
-    fn_ab_r1(
+    sgrproj_box_ab_r1::<BD>(
       &mut a_r1[1],
       &mut b_r1[1],
       &integral_image[integral_image_offset..],
@@ -712,7 +699,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
   for y in (0..stripe_h).step_by(2) {
     // get results to use y and y+1
     let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 {
-      fn_ab_r2(
+      sgrproj_box_ab_r2::<BD>(
         &mut a_r2[(y / 2 + 1) % 2],
         &mut b_r2[(y / 2 + 1) % 2],
         integral_image,
@@ -751,7 +738,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
       let y = y + dy;
       if s_r1 > 0 {
         let integral_image_offset = integral_image_stride + 1;
-        fn_ab_r1(
+        sgrproj_box_ab_r1::<BD>(
           &mut a_r1[(y + 2) % 3],
           &mut b_r1[(y + 2) % 3],
           &integral_image[integral_image_offset..],
@@ -793,9 +780,9 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
       let line = &cdeffed[y];
 
       #[inline(always)]
-      fn apply_filter<U: Pixel>(
+      fn apply_filter<U: Pixel, const BD: usize>(
         out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32],
-        stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32,
+        stripe_w: usize, w0: i32, w1: i32, w2: i32,
       ) {
         let line_it = line[..stripe_w].iter();
         let f_r2_ab_it = f_r2_ab[..stripe_w].iter();
@@ -809,17 +796,16 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
           let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32;
           let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1))
             >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
-          *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1));
+          *o = U::cast_from(clamp(s, 0, (1 << BD) - 1));
         }
       }
 
-      apply_filter(
+      apply_filter::<_, BD>(
         &mut out[y],
         line,
         &f_r1,
         f_r2_ab[dy],
         stripe_w,
-        fi.sequence.bit_depth,
         w0,
         w1,
         w2,
@@ -842,7 +828,7 @@ pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
 
 // Input params follow the same rules as sgrproj_stripe_filter.
 // Inputs are relative to the colocated slice views.
-pub fn sgrproj_solve<T: Pixel>(
+pub fn sgrproj_solve<T: Pixel, const BD: usize>(
   set: u8, fi: &FrameInvariants<T>,
   integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>,
   cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize,
@@ -865,19 +851,6 @@ pub fn sgrproj_solve<T: Pixel>(
   let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]];
   let mut c: [f64; 2] = [0., 0.];
 
-  let fn_ab_r1 = match fi.sequence.bit_depth {
-    8 => sgrproj_box_ab_r1::<8>,
-    10 => sgrproj_box_ab_r1::<10>,
-    12 => sgrproj_box_ab_r1::<12>,
-    _ => unimplemented!(),
-  };
-  let fn_ab_r2 = match fi.sequence.bit_depth {
-    8 => sgrproj_box_ab_r2::<8>,
-    10 => sgrproj_box_ab_r2::<10>,
-    12 => sgrproj_box_ab_r2::<12>,
-    _ => unimplemented!(),
-  };
-
   /* prime the intermediate arrays */
   // One oddness about the radius=2 intermediate array computations that
   // the spec doesn't make clear: Although the spec defines computation
@@ -886,7 +859,7 @@ pub fn sgrproj_solve<T: Pixel>(
   let integral_image = &integral_image_buffer.integral_image;
   let sq_integral_image = &integral_image_buffer.sq_integral_image;
   if s_r2 > 0 {
-    fn_ab_r2(
+    sgrproj_box_ab_r2::<BD>(
       &mut a_r2[0],
       &mut b_r2[0],
       integral_image,
@@ -900,7 +873,7 @@ pub fn sgrproj_solve<T: Pixel>(
   }
   if s_r1 > 0 {
     let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
-    fn_ab_r1(
+    sgrproj_box_ab_r1::<BD>(
       &mut a_r1[0],
       &mut b_r1[0],
       &integral_image[integral_image_offset..],
@@ -911,7 +884,7 @@ pub fn sgrproj_solve<T: Pixel>(
       s_r1,
       fi.cpu_feature_level,
     );
-    fn_ab_r1(
+    sgrproj_box_ab_r1::<BD>(
       &mut a_r1[1],
       &mut b_r1[1],
       &integral_image[integral_image_offset..],
@@ -930,7 +903,7 @@ pub fn sgrproj_solve<T: Pixel>(
   for y in (0..cdef_h).step_by(2) {
     // get results to use y and y+1
     let f_r2_01: [&[u32]; 2] = if s_r2 > 0 {
-      fn_ab_r2(
+      sgrproj_box_ab_r2::<BD>(
         &mut a_r2[(y / 2 + 1) % 2],
         &mut b_r2[(y / 2 + 1) % 2],
         integral_image,
@@ -963,7 +936,7 @@ pub fn sgrproj_solve<T: Pixel>(
       let y = y + dy;
       if s_r1 > 0 {
         let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
-        fn_ab_r1(
+        sgrproj_box_ab_r1::<BD>(
           &mut a_r1[(y + 2) % 3],
           &mut b_r1[(y + 2) % 3],
           &integral_image[integral_image_offset..],
@@ -1093,16 +1066,15 @@ pub fn sgrproj_solve<T: Pixel>(
   }
 }
 
-fn wiener_stripe_filter<T: Pixel>(
-  coeffs: [[i8; 3]; 2], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize,
-  stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize,
-  cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>,
+fn wiener_stripe_filter<T: Pixel, const BD: usize>(
+  coeffs: [[i8; 3]; 2], crop_w: usize, crop_h: usize, stripe_w: usize,
+  stripe_h: usize, stripe_x: usize, stripe_y: isize, cdeffed: &Plane<T>,
+  deblocked: &Plane<T>, out: &mut Plane<T>,
 ) {
-  let bit_depth = fi.sequence.bit_depth;
-  let round_h = if bit_depth == 12 { 5 } else { 3 };
-  let round_v = if bit_depth == 12 { 9 } else { 11 };
-  let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1);
-  let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1;
+  let round_h = if BD == 12 { 5 } else { 3 };
+  let round_v = if BD == 12 { 9 } else { 11 };
+  let offset = 1 << (BD + WIENER_BITS - round_h - 1);
+  let limit = (1 << (BD + 1 + WIENER_BITS - round_h)) - 1;
 
   let mut coeffs_ = [[0; 3]; 2];
   for i in 0..2 {
@@ -1197,7 +1169,7 @@ fn wiener_stripe_filter<T: Pixel>(
       *dst = T::cast_from(clamp(
         (acc + (1 << round_v >> 1)) >> round_v,
         0,
-        (1 << bit_depth) - 1,
+        (1 << BD) - 1,
       ));
     }
   }
@@ -1482,7 +1454,7 @@ impl RestorationState {
   }
 
   #[hawktracer(lrf_filter_frame)]
-  pub fn lrf_filter_frame<T: Pixel>(
+  pub fn lrf_filter_frame<T: Pixel, const BD: usize>(
     &mut self, out: &mut Frame<T>, pre_cdef: &Frame<T>,
     fi: &FrameInvariants<T>,
   ) {
@@ -1530,9 +1502,8 @@ impl RestorationState {
           let ru = rp.restoration_unit_by_stripe(si, rux);
           match ru.filter {
             RestorationFilter::Wiener { coeffs } => {
-              wiener_stripe_filter(
+              wiener_stripe_filter::<_, BD>(
                 coeffs,
-                fi,
                 crop_w,
                 crop_h,
                 size,
@@ -1562,7 +1533,7 @@ impl RestorationState {
                   .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
               );
 
-              sgrproj_stripe_filter(
+              sgrproj_stripe_filter::<_, _, BD>(
                 set,
                 xqd,
                 fi,
diff --git a/src/mc.rs b/src/mc.rs
index d9edde259b..45981cc6c9 100644
--- a/src/mc.rs
+++ b/src/mc.rs
@@ -247,10 +247,10 @@ pub(crate) mod rust {
   }
 
   #[cold_for_target_arch("x86_64")]
-  pub fn put_8tap<T: Pixel>(
+  pub fn put_8tap<T: Pixel, const BD: usize>(
     dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize,
     height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
-    mode_y: FilterMode, bit_depth: usize, _cpu: CpuFeatureLevel,
+    mode_y: FilterMode, _cpu: CpuFeatureLevel,
   ) {
     // The assembly only supports even heights and valid uncropped widths
     assert_eq!(height & 1, 0);
@@ -259,8 +259,8 @@ pub(crate) mod rust {
     let ref_stride = src.plane.cfg.stride;
     let y_filter = get_filter(mode_y, row_frac, height);
     let x_filter = get_filter(mode_x, col_frac, width);
-    let max_sample_val = (1 << bit_depth) - 1;
-    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
+    let max_sample_val = (1 << BD) - 1;
+    let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 };
     match (col_frac, row_frac) {
       (0, 0) => {
         for r in 0..height {
@@ -357,10 +357,10 @@ pub(crate) mod rust {
   const PREP_BIAS: i32 = 8192;
 
   #[cold_for_target_arch("x86_64")]
-  pub fn prep_8tap<T: Pixel>(
+  pub fn prep_8tap<T: Pixel, const BD: usize>(
     tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize,
     col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
-    bit_depth: usize, _cpu: CpuFeatureLevel,
+    _cpu: CpuFeatureLevel,
   ) {
     // The assembly only supports even heights and valid uncropped widths
     assert_eq!(height & 1, 0);
@@ -369,8 +369,8 @@ pub(crate) mod rust {
     let ref_stride = src.plane.cfg.stride;
     let y_filter = get_filter(mode_y, row_frac, height);
     let x_filter = get_filter(mode_x, col_frac, width);
-    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
-    let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS };
+    let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 };
+    let prep_bias = if BD == 8 { 0 } else { PREP_BIAS };
     match (col_frac, row_frac) {
       (0, 0) => {
         for r in 0..height {
@@ -451,17 +451,17 @@ pub(crate) mod rust {
   }
 
   #[cold_for_target_arch("x86_64")]
-  pub fn mc_avg<T: Pixel>(
+  pub fn mc_avg<T: Pixel, const BD: usize>(
     dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize,
-    height: usize, bit_depth: usize, _cpu: CpuFeatureLevel,
+    height: usize, _cpu: CpuFeatureLevel,
   ) {
     // The assembly only supports even heights and valid uncropped widths
     assert_eq!(height & 1, 0);
     assert!(width.is_power_of_two() && (2..=128).contains(&width));
 
-    let max_sample_val = (1 << bit_depth) - 1;
-    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
-    let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS * 2 };
+    let max_sample_val = (1 << BD) - 1;
+    let intermediate_bits = 4 - if BD == 12 { 2 } else { 0 };
+    let prep_bias = if BD == 8 { 0 } else { PREP_BIAS * 2 };
     for r in 0..height {
       let dst_slice = &mut dst[r];
       for c in 0..width {
diff --git a/src/me.rs b/src/me.rs
index a6b09e9f03..ecbcdfa398 100644
--- a/src/me.rs
+++ b/src/me.rs
@@ -154,7 +154,7 @@ pub enum MVSamplingMode {
 }
 
 #[hawktracer(estimate_tile_motion)]
-pub fn estimate_tile_motion<T: Pixel>(
+pub fn estimate_tile_motion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   inter_cfg: &InterConfig,
 ) {
@@ -194,7 +194,7 @@ pub fn estimate_tile_motion<T: Pixel>(
               .block_offset(0, 0);
 
           if new_subsampling {
-            refine_subsampled_sb_motion(
+            refine_subsampled_sb_motion::<_, BD>(
               fi,
               ts,
               ref_frame,
@@ -205,7 +205,7 @@ pub fn estimate_tile_motion<T: Pixel>(
             );
           }
 
-          estimate_sb_motion(
+          estimate_sb_motion::<_, BD>(
             fi,
             ts,
             ref_frame,
@@ -221,7 +221,7 @@ pub fn estimate_tile_motion<T: Pixel>(
   }
 }
 
-fn estimate_sb_motion<T: Pixel>(
+fn estimate_sb_motion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, ref_frame: RefType,
   mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, init: bool, ssdec: u8,
   lambda: u32,
@@ -257,7 +257,7 @@ fn estimate_sb_motion<T: Pixel>(
       // Run motion estimation.
       // Note that the initial search (init) instructs the called function to
       // perform a more extensive search.
-      if let Some(results) = estimate_motion(
+      if let Some(results) = estimate_motion::<_, BD>(
         fi,
         ts,
         w,
@@ -285,7 +285,7 @@ fn estimate_sb_motion<T: Pixel>(
   }
 }
 
-fn refine_subsampled_sb_motion<T: Pixel>(
+fn refine_subsampled_sb_motion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, ref_frame: RefType,
   mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, ssdec: u8, lambda: u32,
 ) {
@@ -307,7 +307,7 @@ fn refine_subsampled_sb_motion<T: Pixel>(
       let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec;
 
       // Refine the existing motion estimate
-      if let Some(results) = refine_subsampled_motion_estimate(
+      if let Some(results) = refine_subsampled_motion_estimate::<_, BD>(
         fi, ts, w, h, sub_bo, ref_frame, ssdec, lambda,
       ) {
         // normalize sad to 128x128 block
@@ -536,7 +536,7 @@ fn get_subset_predictors(
   MotionEstimationSubsets { min_sad, median, subset_b, subset_c }
 }
 
-pub fn estimate_motion<T: Pixel>(
+pub fn estimate_motion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, w: usize, h: usize,
   tile_bo: TileBlockOffset, ref_frame: RefType,
   pmv: Option<[MotionVector; 2]>, corner: MVSamplingMode,
@@ -575,7 +575,7 @@ pub fn estimate_motion<T: Pixel>(
       _ => unimplemented!(),
     };
 
-    let mut best: MotionSearchResult = full_pixel_me(
+    let mut best: MotionSearchResult = full_pixel_me::<_, BD>(
       fi,
       ts,
       org_region,
@@ -599,26 +599,13 @@ pub fn estimate_motion<T: Pixel>(
     if let Some(pmv) = pmv {
       let use_satd: bool = fi.config.speed_settings.motion.use_satd_subpel;
       if use_satd {
-        best.rd = get_fullpel_mv_rd(
-          fi,
-          po,
-          org_region,
-          p_ref,
-          fi.sequence.bit_depth,
-          pmv,
-          lambda,
-          use_satd,
-          mvx_min,
-          mvx_max,
-          mvy_min,
-          mvy_max,
-          w,
-          h,
-          best.mv,
+        best.rd = get_fullpel_mv_rd::<_, BD>(
+          fi, po, org_region, p_ref, pmv, lambda, use_satd, mvx_min, mvx_max,
+          mvy_min, mvy_max, w, h, best.mv,
         );
       }
 
-      sub_pixel_me(
+      sub_pixel_me::<_, BD>(
         fi, po, org_region, p_ref, lambda, pmv, mvx_min, mvx_max, mvy_min,
         mvy_max, w, h, use_satd, &mut best, ref_frame,
       );
@@ -634,7 +621,7 @@ pub fn estimate_motion<T: Pixel>(
 }
 
 /// Refine motion estimation that was computed one level of subsampling up.
-fn refine_subsampled_motion_estimate<T: Pixel>(
+fn refine_subsampled_motion_estimate<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, w: usize, h: usize,
   tile_bo: TileBlockOffset, ref_frame: RefType, ssdec: u8, lambda: u32,
 ) -> Option<MotionSearchResult> {
@@ -679,7 +666,7 @@ fn refine_subsampled_motion_estimate<T: Pixel>(
     let x_hi = po.x + (mv.col as isize / 8 + 2).min(mvx_max / 8);
     let y_lo = po.y + (mv.row as isize / 8 - 1).max(mvy_min / 8);
     let y_hi = po.y + (mv.row as isize / 8 + 2).min(mvy_max / 8);
-    let mut results = full_search(
+    let mut results = full_search::<_, BD>(
       fi, x_lo, x_hi, y_lo, y_hi, w, h, org_region, p_ref, po, 1, lambda, pmv,
     );
 
@@ -692,7 +679,7 @@ fn refine_subsampled_motion_estimate<T: Pixel>(
   }
 }
 
-fn full_pixel_me<T: Pixel>(
+fn full_pixel_me<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
   org_region: &PlaneRegion<T>, p_ref: &Plane<T>, tile_bo: TileBlockOffset,
   po: PlaneOffset, lambda: u32, pmv: [MotionVector; 2], w: usize, h: usize,
@@ -722,29 +709,16 @@ fn full_pixel_me<T: Pixel>(
 
   let try_cands = |predictors: &[MotionVector],
                    best: &mut MotionSearchResult| {
-    let mut results = get_best_predictor(
-      fi,
-      po,
-      org_region,
-      p_ref,
-      predictors,
-      fi.sequence.bit_depth,
-      pmv,
-      lambda,
-      mvx_min,
-      mvx_max,
-      mvy_min,
-      mvy_max,
-      w,
-      h,
+    let mut results = get_best_predictor::<_, BD>(
+      fi, po, org_region, p_ref, predictors, pmv, lambda, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h,
     );
-    fullpel_diamond_search(
+    fullpel_diamond_search::<_, BD>(
       fi,
       po,
       org_region,
       p_ref,
       &mut results,
-      fi.sequence.bit_depth,
       pmv,
       lambda,
       mvx_min,
@@ -770,8 +744,8 @@ fn full_pixel_me<T: Pixel>(
     // from the previous frame. Stop once a candidate with a sad less than a
     // threshold is found.
 
-    let thresh = (subsets.min_sad as f32 * 1.2) as u32
-      + (((w * h) as u32) << (fi.sequence.bit_depth - 8));
+    let thresh =
+      (subsets.min_sad as f32 * 1.2) as u32 + (((w * h) as u32) << (BD - 8));
 
     if let Some(median) = subsets.median {
       try_cands(&[median], &mut best);
@@ -795,21 +769,9 @@ fn full_pixel_me<T: Pixel>(
 
     // Preform UMH search, either as the last possible search when full search
     // is disabled, or as the last search before resorting to full search.
-    uneven_multi_hex_search(
-      fi,
-      po,
-      org_region,
-      p_ref,
-      &mut best,
-      fi.sequence.bit_depth,
-      pmv,
-      lambda,
-      mvx_min,
-      mvx_max,
-      mvy_min,
-      mvy_max,
-      w,
-      h,
+    uneven_multi_hex_search::<_, BD>(
+      fi, po, org_region, p_ref, &mut best, pmv, lambda, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h,
       // Use 24, since it is the largest range that x264 uses.
       24,
     );
@@ -829,7 +791,7 @@ fn full_pixel_me<T: Pixel>(
       let y_lo = po.y + (-range_y).max(mvy_min / 8);
       let y_hi = po.y + (range_y).min(mvy_max / 8);
 
-      let results = full_search(
+      let results = full_search::<_, BD>(
         fi,
         x_lo,
         x_hi,
@@ -857,44 +819,30 @@ fn full_pixel_me<T: Pixel>(
   }
 }
 
-fn sub_pixel_me<T: Pixel>(
+fn sub_pixel_me<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
   p_ref: &Plane<T>, lambda: u32, pmv: [MotionVector; 2], mvx_min: isize,
   mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize,
   use_satd: bool, best: &mut MotionSearchResult, ref_frame: RefType,
 ) {
-  subpel_diamond_search(
-    fi,
-    po,
-    org_region,
-    p_ref,
-    fi.sequence.bit_depth,
-    pmv,
-    lambda,
-    mvx_min,
-    mvx_max,
-    mvy_min,
-    mvy_max,
-    w,
-    h,
-    use_satd,
-    best,
-    ref_frame,
+  subpel_diamond_search::<_, BD>(
+    fi, po, org_region, p_ref, pmv, lambda, mvx_min, mvx_max, mvy_min,
+    mvy_max, w, h, use_satd, best, ref_frame,
   );
 }
 
-fn get_best_predictor<T: Pixel>(
+fn get_best_predictor<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, predictors: &[MotionVector], bit_depth: usize,
-  pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize,
-  mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  p_ref: &Plane<T>, predictors: &[MotionVector], pmv: [MotionVector; 2],
+  lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize,
+  w: usize, h: usize,
 ) -> MotionSearchResult {
   let mut best: MotionSearchResult = MotionSearchResult::empty();
 
   for &init_mv in predictors.iter() {
-    let rd = get_fullpel_mv_rd(
-      fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-      mvx_max, mvy_min, mvy_max, w, h, init_mv,
+    let rd = get_fullpel_mv_rd::<_, BD>(
+      fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h, init_mv,
     );
 
     if rd.cost < best.rd.cost {
@@ -953,11 +901,11 @@ const DIAMOND_R1_PATTERN: [MotionVector; 4] = search_pattern!(
 /// For each step size, candidate motion vectors are examined for improvement
 /// to the current search location. The search location is moved to the best
 /// candidate (if any). This is repeated until the search location stops moving.
-fn fullpel_diamond_search<T: Pixel>(
+fn fullpel_diamond_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize,
-  pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize,
-  mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  p_ref: &Plane<T>, current: &mut MotionSearchResult, pmv: [MotionVector; 2],
+  lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize,
+  w: usize, h: usize,
 ) {
   // Define the initial and the final scale (log2) of the diamond.
   let (mut diamond_radius_log2, diamond_radius_end_log2) = (1u8, 0u8);
@@ -967,9 +915,9 @@ fn fullpel_diamond_search<T: Pixel>(
     let mut best_cand: MotionSearchResult = MotionSearchResult::empty();
     for &offset in &DIAMOND_R1_PATTERN {
       let cand_mv = current.mv + (offset << diamond_radius_log2);
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < best_cand.rd.cost {
@@ -1052,11 +1000,11 @@ const SQUARE_REFINE_PATTERN: [MotionVector; 8] = search_pattern!(
 ///
 /// `current` provides the initial search location and serves as
 /// the output for the final search results.
-fn hexagon_search<T: Pixel>(
+fn hexagon_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize,
-  pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize,
-  mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  p_ref: &Plane<T>, current: &mut MotionSearchResult, pmv: [MotionVector; 2],
+  lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize,
+  w: usize, h: usize,
 ) {
   // The first iteration of hexagon search is implemented separate from
   // subsequent iterations, which overlap with previous iterations.
@@ -1070,9 +1018,9 @@ fn hexagon_search<T: Pixel>(
   // First iteration of hexagon search. There are six candidates to consider.
   for i in 0..6 {
     let cand_mv = current.mv + HEXAGON_PATTERN[i];
-    let rd = get_fullpel_mv_rd(
-      fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-      mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+    let rd = get_fullpel_mv_rd::<_, BD>(
+      fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h, cand_mv,
     );
 
     if rd.cost < best_cand.rd.cost {
@@ -1102,9 +1050,9 @@ fn hexagon_search<T: Pixel>(
       let i = (center_cand_idx + idx_offset_mod6) % 6;
       let cand_mv = current.mv + HEXAGON_PATTERN[i];
 
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < best_cand.rd.cost {
@@ -1119,9 +1067,9 @@ fn hexagon_search<T: Pixel>(
   let mut best_cand: MotionSearchResult = MotionSearchResult::empty();
   for &offset in &SQUARE_REFINE_PATTERN {
     let cand_mv = current.mv + offset;
-    let rd = get_fullpel_mv_rd(
-      fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-      mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+    let rd = get_fullpel_mv_rd::<_, BD>(
+      fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+      mvy_min, mvy_max, w, h, cand_mv,
     );
 
     if rd.cost < best_cand.rd.cost {
@@ -1166,11 +1114,11 @@ const UMH_PATTERN: [MotionVector; 16] = search_pattern!(
 /// the output for the final search results.
 ///
 /// `me_range` parameter determines how far these stages can search.
-fn uneven_multi_hex_search<T: Pixel>(
+fn uneven_multi_hex_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize,
-  pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize,
-  mvy_min: isize, mvy_max: isize, w: usize, h: usize, me_range: i16,
+  p_ref: &Plane<T>, current: &mut MotionSearchResult, pmv: [MotionVector; 2],
+  lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize,
+  w: usize, h: usize, me_range: i16,
 ) {
   assert!(!current.is_empty());
 
@@ -1199,9 +1147,9 @@ fn uneven_multi_hex_search<T: Pixel>(
 
     for &offset in &HORIZONTAL_LINE {
       let cand_mv = center + offset * i;
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < current.rd.cost {
@@ -1220,9 +1168,9 @@ fn uneven_multi_hex_search<T: Pixel>(
 
     for &offset in &VERTICAL_LINE {
       let cand_mv = center + offset * i;
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < current.rd.cost {
@@ -1240,9 +1188,9 @@ fn uneven_multi_hex_search<T: Pixel>(
         continue;
       }
       let cand_mv = center + MotionVector { row, col };
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < current.rd.cost {
@@ -1282,9 +1230,9 @@ fn uneven_multi_hex_search<T: Pixel>(
   for i in 1..=iterations {
     for &offset in &UMH_PATTERN {
       let cand_mv = center + offset * i;
-      let rd = get_fullpel_mv_rd(
-        fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min,
-        mvx_max, mvy_min, mvy_max, w, h, cand_mv,
+      let rd = get_fullpel_mv_rd::<_, BD>(
+        fi, po, org_region, p_ref, pmv, lambda, false, mvx_min, mvx_max,
+        mvy_min, mvy_max, w, h, cand_mv,
       );
 
       if rd.cost < current.rd.cost {
@@ -1295,9 +1243,9 @@ fn uneven_multi_hex_search<T: Pixel>(
   }
 
   // Refine the search results using a 'normal' hexagon search.
-  hexagon_search(
-    fi, po, org_region, p_ref, current, bit_depth, pmv, lambda, mvx_min,
-    mvx_max, mvy_min, mvy_max, w, h,
+  hexagon_search::<_, BD>(
+    fi, po, org_region, p_ref, current, pmv, lambda, mvx_min, mvx_max,
+    mvy_min, mvy_max, w, h,
   );
 }
 
@@ -1306,12 +1254,11 @@ fn uneven_multi_hex_search<T: Pixel>(
 /// For each step size, candidate motion vectors are examined for improvement
 /// to the current search location. The search location is moved to the best
 /// candidate (if any). This is repeated until the search location stops moving.
-fn subpel_diamond_search<T: Pixel>(
+fn subpel_diamond_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  _p_ref: &Plane<T>, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32,
-  mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize,
-  h: usize, use_satd: bool, current: &mut MotionSearchResult,
-  ref_frame: RefType,
+  _p_ref: &Plane<T>, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize,
+  mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  use_satd: bool, current: &mut MotionSearchResult, ref_frame: RefType,
 ) {
   use crate::util::Aligned;
 
@@ -1340,11 +1287,10 @@ fn subpel_diamond_search<T: Pixel>(
     for &offset in &DIAMOND_R1_PATTERN_SUBPEL {
       let cand_mv = current.mv + (offset << diamond_radius_log2);
 
-      let rd = get_subpel_mv_rd(
+      let rd = get_subpel_mv_rd::<_, BD>(
         fi,
         po,
         org_region,
-        bit_depth,
         pmv,
         lambda,
         use_satd,
@@ -1381,11 +1327,11 @@ fn subpel_diamond_search<T: Pixel>(
 }
 
 #[inline]
-fn get_fullpel_mv_rd<T: Pixel>(
+fn get_fullpel_mv_rd<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  p_ref: &Plane<T>, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32,
-  use_satd: bool, mvx_min: isize, mvx_max: isize, mvy_min: isize,
-  mvy_max: isize, w: usize, h: usize, cand_mv: MotionVector,
+  p_ref: &Plane<T>, pmv: [MotionVector; 2], lambda: u32, use_satd: bool,
+  mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize,
+  h: usize, cand_mv: MotionVector,
 ) -> MVCandidateRD {
   if (cand_mv.col as isize) < mvx_min
     || (cand_mv.col as isize) > mvx_max
@@ -1400,17 +1346,16 @@ fn get_fullpel_mv_rd<T: Pixel>(
     x: po.x + (cand_mv.col / 8) as isize,
     y: po.y + (cand_mv.row / 8) as isize,
   });
-  compute_mv_rd(
-    fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region,
-    &plane_ref,
+  compute_mv_rd::<_, BD>(
+    fi, pmv, lambda, use_satd, w, h, cand_mv, org_region, &plane_ref,
   )
 }
 
-fn get_subpel_mv_rd<T: Pixel>(
+fn get_subpel_mv_rd<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>,
-  bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, use_satd: bool,
-  mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize,
-  h: usize, cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut<T>,
+  pmv: [MotionVector; 2], lambda: u32, use_satd: bool, mvx_min: isize,
+  mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize,
+  cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut<T>,
   ref_frame: RefType,
 ) -> MVCandidateRD {
   if (cand_mv.col as isize) < mvx_min
@@ -1426,29 +1371,28 @@ fn get_subpel_mv_rd<T: Pixel>(
   let tile_rect =
     TileRect { x: 0, y: 0, width: tmp_width, height: tmp_height };
 
-  PredictionMode::NEWMV.predict_inter_single(
+  PredictionMode::NEWMV.predict_inter_single::<_, BD>(
     fi, tile_rect, 0, po, tmp_region,
     // motion comp's w & h on edges can be different than distortion's
     tmp_width, tmp_height, ref_frame, cand_mv,
   );
   let plane_ref = tmp_region.as_const();
-  compute_mv_rd(
-    fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region,
-    &plane_ref,
+  compute_mv_rd::<_, BD>(
+    fi, pmv, lambda, use_satd, w, h, cand_mv, org_region, &plane_ref,
   )
 }
 
 /// Compute the rate distortion stats for a motion vector.
 #[inline(always)]
-fn compute_mv_rd<T: Pixel>(
+fn compute_mv_rd<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, pmv: [MotionVector; 2], lambda: u32,
-  use_satd: bool, bit_depth: usize, w: usize, h: usize, cand_mv: MotionVector,
+  use_satd: bool, w: usize, h: usize, cand_mv: MotionVector,
   plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>,
 ) -> MVCandidateRD {
   let sad = if use_satd {
-    get_satd(plane_org, plane_ref, w, h, bit_depth, fi.cpu_feature_level)
+    get_satd::<_, BD>(plane_org, plane_ref, w, h, fi.cpu_feature_level)
   } else {
-    get_sad(plane_org, plane_ref, w, h, bit_depth, fi.cpu_feature_level)
+    get_sad(plane_org, plane_ref, w, h, fi.cpu_feature_level)
   };
 
   let rate1 = get_mv_rate(cand_mv, pmv[0], fi.allow_high_precision_mv);
@@ -1458,7 +1402,7 @@ fn compute_mv_rd<T: Pixel>(
   MVCandidateRD { cost: 256 * sad as u64 + rate as u64 * lambda as u64, sad }
 }
 
-fn full_search<T: Pixel>(
+fn full_search<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize,
   w: usize, h: usize, org_region: &PlaneRegion<T>, p_ref: &Plane<T>,
   po: PlaneOffset, step: usize, lambda: u32, pmv: [MotionVector; 2],
@@ -1482,12 +1426,11 @@ fn full_search<T: Pixel>(
         col: 8 * (x as i16 - po.x as i16),
       };
 
-      let rd = compute_mv_rd(
+      let rd = compute_mv_rd::<_, BD>(
         fi,
         pmv,
         lambda,
         false,
-        fi.sequence.bit_depth,
         w,
         h,
         mv,
diff --git a/src/partition.rs b/src/partition.rs
index e64de6e9ad..22b800b646 100644
--- a/src/partition.rs
+++ b/src/partition.rs
@@ -591,7 +591,7 @@ fn supersample_chroma_bsize(
   }
 }
 
-pub fn get_intra_edges<T: Pixel>(
+pub fn get_intra_edges<T: Pixel, const BD: usize>(
   dst: &PlaneRegion<'_, T>,
   partition_bo: TileBlockOffset, // partition bo, BlockOffset
   bx: usize,
@@ -599,7 +599,6 @@ pub fn get_intra_edges<T: Pixel>(
   partition_size: BlockSize, // partition size, BlockSize
   po: PlaneOffset,
   tx_size: TxSize,
-  bit_depth: usize,
   opt_mode: Option<PredictionMode>,
   enable_intra_edge_filter: bool,
   intra_param: IntraParam,
@@ -610,7 +609,7 @@ pub fn get_intra_edges<T: Pixel>(
   let mut edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> =
     unsafe { Aligned::uninitialized() };
   //Aligned::new([T::cast_from(0); 4 * MAX_TX_SIZE + 1]);
-  let base = 128u16 << (bit_depth - 8);
+  let base = 128u16 << (BD - 8);
 
   {
     // left pixels are ordered from bottom to top and right-aligned
diff --git a/src/predict.rs b/src/predict.rs
index 632196c72a..0c3098233d 100644
--- a/src/predict.rs
+++ b/src/predict.rs
@@ -205,9 +205,9 @@ impl PredictionMode {
   /// # Panics
   ///
   /// - If called on an inter `PredictionMode`
-  pub fn predict_intra<T: Pixel>(
+  pub fn predict_intra<T: Pixel, const BD: usize>(
     self, tile_rect: TileRect, dst: &mut PlaneRegionMut<'_, T>,
-    tx_size: TxSize, bit_depth: usize, ac: &[i16], intra_param: IntraParam,
+    tx_size: TxSize, ac: &[i16], intra_param: IntraParam,
     ief_params: Option<IntraEdgeFilterParameters>,
     edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, cpu: CpuFeatureLevel,
   ) {
@@ -245,9 +245,8 @@ impl PredictionMode {
       _ => intra_mode_to_angle(mode) + (angle_delta * ANGLE_STEP) as isize,
     };
 
-    dispatch_predict_intra::<T>(
-      mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf,
-      cpu,
+    dispatch_predict_intra::<T, BD>(
+      mode, variant, dst, tx_size, ac, angle, ief_params, edge_buf, cpu,
     );
   }
 
@@ -304,7 +303,7 @@ impl PredictionMode {
   /// # Panics
   ///
   /// - If called on an intra `PredictionMode`
-  pub fn predict_inter_single<T: Pixel>(
+  pub fn predict_inter_single<T: Pixel, const BD: usize>(
     self, fi: &FrameInvariants<T>, tile_rect: TileRect, p: usize,
     po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize,
     height: usize, ref_frame: RefType, mv: MotionVector,
@@ -319,7 +318,7 @@ impl PredictionMode {
     {
       let (row_frac, col_frac, src) =
         PredictionMode::get_mv_params(&rec.frame.planes[p], frame_po, mv);
-      put_8tap(
+      put_8tap::<_, BD>(
         dst,
         src,
         width,
@@ -328,7 +327,6 @@ impl PredictionMode {
         row_frac,
         mode,
         mode,
-        fi.sequence.bit_depth,
         fi.cpu_feature_level,
       );
     }
@@ -339,7 +337,7 @@ impl PredictionMode {
   /// # Panics
   ///
   /// - If called on an intra `PredictionMode`
-  pub fn predict_inter_compound<T: Pixel>(
+  pub fn predict_inter_compound<T: Pixel, const BD: usize>(
     self, fi: &FrameInvariants<T>, tile_rect: TileRect, p: usize,
     po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize,
     height: usize, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
@@ -359,7 +357,7 @@ impl PredictionMode {
           frame_po,
           mvs[i],
         );
-        prep_8tap(
+        prep_8tap::<_, BD>(
           buffer.get_buffer_mut(i),
           src,
           width,
@@ -368,25 +366,23 @@ impl PredictionMode {
           row_frac,
           mode,
           mode,
-          fi.sequence.bit_depth,
           fi.cpu_feature_level,
         );
       }
     }
-    mc_avg(
+    mc_avg::<_, BD>(
       dst,
       buffer.get_buffer(0),
       buffer.get_buffer(1),
       width,
       height,
-      fi.sequence.bit_depth,
       fi.cpu_feature_level,
     );
   }
 
   /// Inter prediction that determines whether compound mode is being used based
   /// on the second [`RefType`] in [`ref_frames`].
-  pub fn predict_inter<T: Pixel>(
+  pub fn predict_inter<T: Pixel, const BD: usize>(
     self, fi: &FrameInvariants<T>, tile_rect: TileRect, p: usize,
     po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize,
     height: usize, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
@@ -396,7 +392,7 @@ impl PredictionMode {
       && ref_frames[1] != RefType::NONE_FRAME;
 
     if !is_compound {
-      self.predict_inter_single(
+      self.predict_inter_single::<_, BD>(
         fi,
         tile_rect,
         p,
@@ -408,7 +404,7 @@ impl PredictionMode {
         mvs[0],
       )
     } else {
-      self.predict_inter_compound(
+      self.predict_inter_compound::<_, BD>(
         fi,
         tile_rect,
         p,
@@ -698,10 +694,10 @@ pub(crate) mod rust {
   use std::mem::size_of;
 
   #[inline(always)]
-  pub fn dispatch_predict_intra<T: Pixel>(
+  pub fn dispatch_predict_intra<T: Pixel, const BD: usize>(
     mode: PredictionMode, variant: PredictionVariant,
-    dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize,
-    ac: &[i16], angle: isize, ief_params: Option<IntraEdgeFilterParameters>,
+    dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, ac: &[i16],
+    angle: isize, ief_params: Option<IntraEdgeFilterParameters>,
     edge_buf: &Aligned<[T; 4 * MAX_TX_SIZE + 1]>, _cpu: CpuFeatureLevel,
   ) {
     let width = tx_size.width();
@@ -718,11 +714,11 @@ pub(crate) mod rust {
     match mode {
       PredictionMode::DC_PRED => {
         (match variant {
-          PredictionVariant::NONE => pred_dc_128,
+          PredictionVariant::NONE => pred_dc_128::<_, BD>,
           PredictionVariant::LEFT => pred_dc_left,
           PredictionVariant::TOP => pred_dc_top,
           PredictionVariant::BOTH => pred_dc,
-        })(dst, above_slice, left_slice, width, height, bit_depth)
+        })(dst, above_slice, left_slice, width, height)
       }
       PredictionMode::V_PRED if angle == 90 => {
         pred_v(dst, above_slice, width, height)
@@ -737,7 +733,7 @@ pub(crate) mod rust {
       | PredictionMode::D113_PRED
       | PredictionMode::D157_PRED
       | PredictionMode::D203_PRED
-      | PredictionMode::D67_PRED => pred_directional(
+      | PredictionMode::D67_PRED => pred_directional::<_, BD>(
         dst,
         above_slice,
         left_and_left_below_slice,
@@ -745,7 +741,6 @@ pub(crate) mod rust {
         angle as usize,
         width,
         height,
-        bit_depth,
         ief_params,
       ),
       PredictionMode::SMOOTH_PRED => {
@@ -760,28 +755,23 @@ pub(crate) mod rust {
       PredictionMode::PAETH_PRED => {
         pred_paeth(dst, above_slice, left_slice, top_left[0], width, height)
       }
-      PredictionMode::UV_CFL_PRED => (match variant {
-        PredictionVariant::NONE => pred_cfl_128,
-        PredictionVariant::LEFT => pred_cfl_left,
-        PredictionVariant::TOP => pred_cfl_top,
-        PredictionVariant::BOTH => pred_cfl,
-      })(
-        dst,
-        ac,
-        angle as i16,
-        above_slice,
-        left_slice,
-        width,
-        height,
-        bit_depth,
-      ),
+      PredictionMode::UV_CFL_PRED => {
+        (match variant {
+          PredictionVariant::NONE => pred_cfl_128::<_, BD>,
+          PredictionVariant::LEFT => pred_cfl_left::<_, BD>,
+          PredictionVariant::TOP => pred_cfl_top::<_, BD>,
+          PredictionVariant::BOTH => pred_cfl::<_, BD>,
+        })(
+          dst, ac, angle as i16, above_slice, left_slice, width, height
+        )
+      }
       _ => unimplemented!(),
     }
   }
 
   pub(crate) fn pred_dc<T: Pixel>(
     output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], width: usize,
-    height: usize, _bit_depth: usize,
+    height: usize,
   ) {
     let edges = left[..height].iter().chain(above[..width].iter());
     let len = (width + height) as u32;
@@ -797,11 +787,11 @@ pub(crate) mod rust {
     }
   }
 
-  pub(crate) fn pred_dc_128<T: Pixel>(
+  pub(crate) fn pred_dc_128<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, _above: &[T], _left: &[T],
-    width: usize, height: usize, bit_depth: usize,
+    width: usize, height: usize,
   ) {
-    let v = T::cast_from(128u32 << (bit_depth - 8));
+    let v = T::cast_from(128u32 << (BD - 8));
     for line in output.rows_iter_mut().take(height) {
       line[..width].fill(v);
     }
@@ -809,7 +799,7 @@ pub(crate) mod rust {
 
   pub(crate) fn pred_dc_left<T: Pixel>(
     output: &mut PlaneRegionMut<'_, T>, _above: &[T], left: &[T],
-    width: usize, height: usize, _bit_depth: usize,
+    width: usize, height: usize,
   ) {
     let sum = left[..].iter().fold(0u32, |acc, &v| {
       let v: u32 = v.into();
@@ -823,7 +813,7 @@ pub(crate) mod rust {
 
   pub(crate) fn pred_dc_top<T: Pixel>(
     output: &mut PlaneRegionMut<'_, T>, above: &[T], _left: &[T],
-    width: usize, height: usize, _bit_depth: usize,
+    width: usize, height: usize,
   ) {
     let sum = above[..width].iter().fold(0u32, |acc, &v| {
       let v: u32 = v.into();
@@ -1051,9 +1041,9 @@ pub(crate) mod rust {
     }
   }
 
-  pub(crate) fn pred_cfl_inner<T: Pixel>(
+  pub(crate) fn pred_cfl_inner<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, width: usize,
-    height: usize, bit_depth: usize,
+    height: usize,
   ) {
     if alpha == 0 {
       return;
@@ -1063,7 +1053,7 @@ pub(crate) mod rust {
     assert!(output.plane_cfg.stride >= width);
     assert!(output.rows_iter().len() >= height);
 
-    let sample_max = (1 << bit_depth) - 1;
+    let sample_max = (1 << BD) - 1;
     let avg: i32 = output[0][0].into();
 
     for (line, luma) in
@@ -1077,43 +1067,43 @@ pub(crate) mod rust {
     }
   }
 
-  pub(crate) fn pred_cfl<T: Pixel>(
+  pub(crate) fn pred_cfl<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T],
-    left: &[T], width: usize, height: usize, bit_depth: usize,
+    left: &[T], width: usize, height: usize,
   ) {
-    pred_dc(output, above, left, width, height, bit_depth);
-    pred_cfl_inner(output, ac, alpha, width, height, bit_depth);
+    pred_dc(output, above, left, width, height);
+    pred_cfl_inner::<_, BD>(output, ac, alpha, width, height);
   }
 
-  pub(crate) fn pred_cfl_128<T: Pixel>(
+  pub(crate) fn pred_cfl_128<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T],
-    left: &[T], width: usize, height: usize, bit_depth: usize,
+    left: &[T], width: usize, height: usize,
   ) {
-    pred_dc_128(output, above, left, width, height, bit_depth);
-    pred_cfl_inner(output, ac, alpha, width, height, bit_depth);
+    pred_dc_128::<_, BD>(output, above, left, width, height);
+    pred_cfl_inner::<_, BD>(output, ac, alpha, width, height);
   }
 
-  pub(crate) fn pred_cfl_left<T: Pixel>(
+  pub(crate) fn pred_cfl_left<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T],
-    left: &[T], width: usize, height: usize, bit_depth: usize,
+    left: &[T], width: usize, height: usize,
   ) {
-    pred_dc_left(output, above, left, width, height, bit_depth);
-    pred_cfl_inner(output, ac, alpha, width, height, bit_depth);
+    pred_dc_left(output, above, left, width, height);
+    pred_cfl_inner::<_, BD>(output, ac, alpha, width, height);
   }
 
-  pub(crate) fn pred_cfl_top<T: Pixel>(
+  pub(crate) fn pred_cfl_top<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T],
-    left: &[T], width: usize, height: usize, bit_depth: usize,
+    left: &[T], width: usize, height: usize,
   ) {
-    pred_dc_top(output, above, left, width, height, bit_depth);
-    pred_cfl_inner(output, ac, alpha, width, height, bit_depth);
+    pred_dc_top(output, above, left, width, height);
+    pred_cfl_inner::<_, BD>(output, ac, alpha, width, height);
   }
 
   #[allow(clippy::clone_double_ref)]
-  pub(crate) fn pred_directional<T: Pixel>(
+  pub(crate) fn pred_directional<T: Pixel, const BD: usize>(
     output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T],
     top_left: &[T], p_angle: usize, width: usize, height: usize,
-    bit_depth: usize, ief_params: Option<IntraEdgeFilterParameters>,
+    ief_params: Option<IntraEdgeFilterParameters>,
   ) {
     #[allow(clippy::collapsible_if)]
     #[allow(clippy::collapsible_else_if)]
@@ -1223,7 +1213,7 @@ pub(crate) mod rust {
       edge.copy_from_slice(edge_filtered.as_slice());
     }
 
-    fn upsample_edge<T: Pixel>(size: usize, edge: &mut [T], bit_depth: usize) {
+    fn upsample_edge<T: Pixel, const BD: usize>(size: usize, edge: &mut [T]) {
       // The input edge should be valid in the -1..size range,
       // where the -1 index is the top-left edge pixel. Since
       // negative indices are unsafe in Rust, the caller is
@@ -1247,14 +1237,14 @@ pub(crate) mod rust {
           + (9 * dup[i + 1].to_i32().unwrap())
           + (9 * dup[i + 2].to_i32().unwrap())
           - dup[i + 3].to_i32().unwrap();
-        s = ((s + 8) / 16).clamp(0, (1 << bit_depth) - 1);
+        s = ((s + 8) / 16).clamp(0, (1 << BD) - 1);
 
         edge[2 * i + 1] = T::cast_from(s);
         edge[2 * i + 2] = dup[i + 2];
       }
     }
 
-    let sample_max = (1 << bit_depth) - 1;
+    let sample_max = (1 << BD) - 1;
 
     let max_x = output.plane_cfg.width as isize - 1;
     let max_y = output.plane_cfg.height as isize - 1;
@@ -1332,7 +1322,7 @@ pub(crate) mod rust {
         p_angle as isize - 90,
       );
       if upsample_above {
-        upsample_edge(num_px.0, above_filtered.as_mut_slice(), bit_depth);
+        upsample_edge::<_, BD>(num_px.0, &mut above_filtered[..]);
       }
       upsample_left = select_ief_upsample(
         width,
@@ -1341,7 +1331,7 @@ pub(crate) mod rust {
         p_angle as isize - 180,
       );
       if upsample_left {
-        upsample_edge(num_px.1, left_filtered.as_mut_slice(), bit_depth);
+        upsample_edge::<_, BD>(num_px.1, &mut left_filtered[..]);
       }
 
       left_filtered.reverse();
@@ -1509,16 +1499,16 @@ mod test {
 
     let mut output = Plane::from_slice(&[0u8; 4 * 4], 4);
 
-    pred_dc(&mut output.as_region_mut(), above, left, 4, 4, 8);
+    pred_dc(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [32u8; 16]);
 
-    pred_dc_top(&mut output.as_region_mut(), above, left, 4, 4, 8);
+    pred_dc_top(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [35u8; 16]);
 
-    pred_dc_left(&mut output.as_region_mut(), above, left, 4, 4, 8);
+    pred_dc_left(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [30u8; 16]);
 
-    pred_dc_128(&mut output.as_region_mut(), above, left, 4, 4, 8);
+    pred_dc_128::<_, 8>(&mut output.as_region_mut(), above, left, 4, 4);
     assert_eq!(&output.data[..], [128u8; 16]);
 
     pred_v(&mut output.as_region_mut(), above, 4, 4);
@@ -1594,7 +1584,7 @@ mod test {
       [33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36],
     ];
     for (&angle, expected) in angles.iter().zip(expected.iter()) {
-      pred_directional(
+      pred_directional::<_, 8>(
         &mut output.as_region_mut(),
         above,
         left,
@@ -1602,7 +1592,6 @@ mod test {
         angle,
         4,
         4,
-        8,
         None,
       );
       assert_eq!(&output.data[..], expected);
@@ -1617,7 +1606,7 @@ mod test {
 
     let mut o = Plane::from_slice(&vec![0u16; 32 * 32], 32);
 
-    pred_dc(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4, 16);
+    pred_dc(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4);
 
     for l in o.data.chunks(32).take(4) {
       for v in l[..4].iter() {
diff --git a/src/quantize/mod.rs b/src/quantize/mod.rs
index 72006361fb..2f6e2b103a 100644
--- a/src/quantize/mod.rs
+++ b/src/quantize/mod.rs
@@ -36,18 +36,24 @@ pub fn get_log_tx_scale(tx_size: TxSize) -> usize {
     + Into::<usize>::into(num_pixels > 1024)
 }
 
-pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
-  let dc_q: [&[NonZeroU16; 256]; 3] =
-    [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3];
-  let bd = ((bit_depth ^ 8) >> 1).min(2);
-  dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
+pub fn dc_q<const BD: usize>(qindex: u8, delta_q: i8) -> NonZeroU16 {
+  let dc_q = match BD {
+    8 => &dc_qlookup_Q3,
+    10 => &dc_qlookup_10_Q3,
+    12 => &dc_qlookup_12_Q3,
+    _ => unimplemented!(),
+  };
+  dc_q[((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
 }
 
-pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
-  let ac_q: [&[NonZeroU16; 256]; 3] =
-    [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3];
-  let bd = ((bit_depth ^ 8) >> 1).min(2);
-  ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
+pub fn ac_q<const BD: usize>(qindex: u8, delta_q: i8) -> NonZeroU16 {
+  let ac_q = match BD {
+    8 => &ac_qlookup_Q3,
+    10 => &ac_qlookup_10_Q3,
+    12 => &ac_qlookup_12_Q3,
+    _ => unimplemented!(),
+  };
+  ac_q[((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
 }
 
 // TODO: Handle lossless properly.
@@ -78,8 +84,8 @@ fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 {
   }
 }
 
-pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
-  let qlookup = match bit_depth {
+pub fn select_dc_qi<const BD: usize>(quantizer: i64) -> u8 {
+  let qlookup = match BD {
     8 => &dc_qlookup_Q3,
     10 => &dc_qlookup_10_Q3,
     12 => &dc_qlookup_12_Q3,
@@ -88,8 +94,8 @@ pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
   select_qi(quantizer, qlookup)
 }
 
-pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 {
-  let qlookup = match bit_depth {
+pub fn select_ac_qi<const BD: usize>(quantizer: i64) -> u8 {
+  let qlookup = match BD {
     8 => &ac_qlookup_Q3,
     10 => &ac_qlookup_10_Q3,
     12 => &ac_qlookup_12_Q3,
@@ -218,16 +224,16 @@ mod test {
 }
 
 impl QuantizationContext {
-  pub fn update(
-    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize,
-    dc_delta_q: i8, ac_delta_q: i8,
+  pub fn update<const BD: usize>(
+    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, dc_delta_q: i8,
+    ac_delta_q: i8,
   ) {
     self.log_tx_scale = get_log_tx_scale(tx_size);
 
-    self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth);
+    self.dc_quant = dc_q::<BD>(qindex, dc_delta_q);
     self.dc_mul_add = divu_gen(self.dc_quant.into());
 
-    self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth);
+    self.ac_quant = ac_q::<BD>(qindex, ac_delta_q);
     self.ac_mul_add = divu_gen(self.ac_quant.into());
 
     // All of these biases were derived by measuring the cost of coding
@@ -352,15 +358,15 @@ pub mod rust {
   use super::*;
   use crate::cpu_features::CpuFeatureLevel;
 
-  pub fn dequantize<T: Coefficient>(
+  pub fn dequantize<T: Coefficient, const BD: usize>(
     qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-    bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel,
+    dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel,
   ) {
     let log_tx_scale = get_log_tx_scale(tx_size) as i32;
     let offset = (1 << log_tx_scale) - 1;
 
-    let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32;
-    let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32;
+    let dc_quant = dc_q::<BD>(qindex, dc_delta_q).get() as i32;
+    let ac_quant = ac_q::<BD>(qindex, ac_delta_q).get() as i32;
 
     for (i, (r, c)) in rcoeffs
       .iter_mut()
diff --git a/src/rate.rs b/src/rate.rs
index 3ae7b09e1b..da0d98d636 100644
--- a/src/rate.rs
+++ b/src/rate.rs
@@ -522,15 +522,14 @@ fn chroma_offset(
 }
 
 impl QuantizerParameters {
-  fn new_from_log_q(
-    log_base_q: i64, log_target_q: i64, bit_depth: usize,
-    chroma_sampling: ChromaSampling, is_intra: bool,
-    log_isqrt_mean_scale: i64,
+  fn new_from_log_q<const BD: usize>(
+    log_base_q: i64, log_target_q: i64, chroma_sampling: ChromaSampling,
+    is_intra: bool, log_isqrt_mean_scale: i64,
   ) -> QuantizerParameters {
-    let scale = log_isqrt_mean_scale + q57(QSCALE + bit_depth as i32 - 8);
+    let scale = log_isqrt_mean_scale + q57(QSCALE + BD as i32 - 8);
 
     let mut log_q_y = log_target_q;
-    if !is_intra && bit_depth == 8 {
+    if !is_intra && BD == 8 {
       log_q_y = log_target_q
         + (log_target_q >> 32) * Q_MODEL_MUL[chroma_sampling as usize]
         + Q_MODEL_ADD[chroma_sampling as usize];
@@ -552,7 +551,7 @@ impl QuantizerParameters {
     let scale = |q| bexp64((log_target_q - q) * 2 + q57(16)) as f64 / 65536.;
     let dist_scale = [scale(log_q_y), scale(log_q_u), scale(log_q_v)];
 
-    let base_q_idx = select_ac_qi(quantizer, bit_depth).max(1);
+    let base_q_idx = select_ac_qi::<BD>(quantizer).max(1);
 
     // delta_q only gets 6 bits + a sign bit, so it can differ by 63 at most.
     let min_qi = base_q_idx.saturating_sub(63).max(1);
@@ -564,14 +563,14 @@ impl QuantizerParameters {
       log_target_q,
       // TODO: Allow lossless mode; i.e. qi == 0.
       dc_qi: [
-        clamp_qi(select_dc_qi(quantizer, bit_depth)),
-        if mono { 0 } else { clamp_qi(select_dc_qi(quantizer_u, bit_depth)) },
-        if mono { 0 } else { clamp_qi(select_dc_qi(quantizer_v, bit_depth)) },
+        clamp_qi(select_dc_qi::<BD>(quantizer)),
+        if mono { 0 } else { clamp_qi(select_dc_qi::<BD>(quantizer_u)) },
+        if mono { 0 } else { clamp_qi(select_dc_qi::<BD>(quantizer_v)) },
       ],
       ac_qi: [
         base_q_idx,
-        if mono { 0 } else { clamp_qi(select_ac_qi(quantizer_u, bit_depth)) },
-        if mono { 0 } else { clamp_qi(select_ac_qi(quantizer_v, bit_depth)) },
+        if mono { 0 } else { clamp_qi(select_ac_qi::<BD>(quantizer_u)) },
+        if mono { 0 } else { clamp_qi(select_ac_qi::<BD>(quantizer_v)) },
       ],
       lambda,
       dist_scale,
@@ -701,17 +700,16 @@ impl RCState {
     }
   }
 
-  pub(crate) fn select_first_pass_qi(
-    &self, bit_depth: usize, fti: usize, chroma_sampling: ChromaSampling,
+  pub(crate) fn select_first_pass_qi<const BD: usize>(
+    &self, fti: usize, chroma_sampling: ChromaSampling,
   ) -> QuantizerParameters {
     // Adjust the quantizer for the frame type, result is Q57:
     let log_q = ((self.pass1_log_base_q + (1i64 << 11)) >> 12)
       * (MQP_Q12[fti] as i64)
       + DQP_Q57[fti];
-    QuantizerParameters::new_from_log_q(
+    QuantizerParameters::new_from_log_q::<BD>(
       self.pass1_log_base_q,
       log_q,
-      bit_depth,
       chroma_sampling,
       fti == 0,
       0,
@@ -719,7 +717,7 @@ impl RCState {
   }
 
   // TODO: Separate quantizers for Cb and Cr.
-  pub(crate) fn select_qi<T: Pixel>(
+  pub(crate) fn select_qi<T: Pixel, const BD: usize>(
     &self, ctx: &ContextInner<T>, output_frameno: u64, fti: usize,
     maybe_prev_log_base_q: Option<i64>, log_isqrt_mean_scale: i64,
   ) -> QuantizerParameters {
@@ -727,14 +725,12 @@ impl RCState {
     if self.target_bitrate <= 0 {
       // Rate control is not active.
       // Derive quantizer directly from frame type.
-      let bit_depth = ctx.config.bit_depth;
       let chroma_sampling = ctx.config.chroma_sampling;
       let (log_base_q, log_q) =
-        Self::calc_flat_quantizer(ctx.config.quantizer as u8, bit_depth, fti);
-      QuantizerParameters::new_from_log_q(
+        Self::calc_flat_quantizer::<BD>(ctx.config.quantizer as u8, fti);
+      QuantizerParameters::new_from_log_q::<BD>(
         log_base_q,
         log_q,
-        bit_depth,
         chroma_sampling,
         fti == 0,
         log_isqrt_mean_scale,
@@ -748,11 +744,8 @@ impl RCState {
       match self.twopass_state {
         // First pass of 2-pass mode: use a fixed base quantizer.
         PASS_1 => {
-          return self.select_first_pass_qi(
-            ctx.config.bit_depth,
-            fti,
-            ctx.config.chroma_sampling,
-          );
+          return self
+            .select_first_pass_qi::<BD>(fti, ctx.config.chroma_sampling);
         }
         // Second pass of 2-pass mode: we know exactly how much of each frame
         //  type there is in the current buffer window, and have estimates for
@@ -906,17 +899,16 @@ impl RCState {
       //  in the binary log domain (binary exp and log aren't too bad):
       //  rate = exp2(log2(scale) - log2(quantizer)*exp)
       // There's no easy closed form solution, so we bisection searh for it.
-      let bit_depth = ctx.config.bit_depth;
       let chroma_sampling = ctx.config.chroma_sampling;
       // TODO: Proper handling of lossless.
-      let mut log_qlo = blog64(ac_q(self.ac_qi_min, 0, bit_depth).get() as i64)
-        - q57(QSCALE + bit_depth as i32 - 8);
+      let mut log_qlo = blog64(ac_q::<BD>(self.ac_qi_min, 0).get() as i64)
+        - q57(QSCALE + BD as i32 - 8);
       // The AC quantizer tables map to values larger than the DC quantizer
       //  tables, so we use that as the upper bound to make sure we can use
       //  the full table if needed.
       let mut log_qhi = blog64(
-        ac_q(self.maybe_ac_qi_max.unwrap_or(255), 0, bit_depth).get() as i64,
-      ) - q57(QSCALE + bit_depth as i32 - 8);
+        ac_q::<BD>(self.maybe_ac_qi_max.unwrap_or(255), 0).get() as i64,
+      ) - q57(QSCALE + BD as i32 - 8);
       let mut log_base_q = (log_qlo + log_qhi) >> 1;
       while log_qlo < log_qhi {
         // Count bits contributed by each frame type using the model.
@@ -1020,20 +1012,19 @@ impl RCState {
 
       if let Some(qi_max) = self.maybe_ac_qi_max {
         let (max_log_base_q, max_log_q) =
-          Self::calc_flat_quantizer(qi_max, ctx.config.bit_depth, fti);
+          Self::calc_flat_quantizer::<BD>(qi_max, fti);
         log_base_q = cmp::min(log_base_q, max_log_base_q);
         log_q = cmp::min(log_q, max_log_q);
       }
       if self.ac_qi_min > 0 {
         let (min_log_base_q, min_log_q) =
-          Self::calc_flat_quantizer(self.ac_qi_min, ctx.config.bit_depth, fti);
+          Self::calc_flat_quantizer::<BD>(self.ac_qi_min, fti);
         log_base_q = cmp::max(log_base_q, min_log_base_q);
         log_q = cmp::max(log_q, min_log_q);
       }
-      QuantizerParameters::new_from_log_q(
+      QuantizerParameters::new_from_log_q::<BD>(
         log_base_q,
         log_q,
-        bit_depth,
         chroma_sampling,
         fti == 0,
         log_isqrt_mean_scale,
@@ -1043,8 +1034,8 @@ impl RCState {
 
   // Computes a quantizer directly from the frame type and base quantizer index,
   // without consideration for rate control.
-  fn calc_flat_quantizer(
-    base_qi: u8, bit_depth: usize, fti: usize,
+  fn calc_flat_quantizer<const BD: usize>(
+    base_qi: u8, fti: usize,
   ) -> (i64, i64) {
     // TODO: Rename "quantizer" something that indicates it is a quantizer
     //  index, and move it somewhere more sensible (or choose a better way to
@@ -1052,13 +1043,13 @@ impl RCState {
 
     // We use the AC quantizer as the source quantizer since its quantizer
     //  tables have unique entries, while the DC tables do not.
-    let ac_quantizer = ac_q(base_qi, 0, bit_depth).get() as i64;
+    let ac_quantizer = ac_q::<BD>(base_qi, 0).get() as i64;
     // Pick the nearest DC entry since an exact match may be unavailable.
-    let dc_qi = select_dc_qi(ac_quantizer, bit_depth);
-    let dc_quantizer = dc_q(dc_qi, 0, bit_depth).get() as i64;
+    let dc_qi = select_dc_qi::<BD>(ac_quantizer);
+    let dc_quantizer = dc_q::<BD>(dc_qi, 0).get() as i64;
     // Get the log quantizers as Q57.
-    let log_ac_q = blog64(ac_quantizer) - q57(QSCALE + bit_depth as i32 - 8);
-    let log_dc_q = blog64(dc_quantizer) - q57(QSCALE + bit_depth as i32 - 8);
+    let log_ac_q = blog64(ac_quantizer) - q57(QSCALE + BD as i32 - 8);
+    let log_dc_q = blog64(dc_quantizer) - q57(QSCALE + BD as i32 - 8);
     // Target the midpoint of the chosen entries.
     let log_base_q = (log_ac_q + log_dc_q + 1) >> 1;
     // Adjust the quantizer for the frame type, result is Q57:
@@ -1255,11 +1246,13 @@ impl RCState {
     cur_pos
   }
 
-  pub(crate) fn select_pass1_log_base_q<T: Pixel>(
+  pub(crate) fn select_pass1_log_base_q<T: Pixel, const BD: usize>(
     &self, ctx: &ContextInner<T>, output_frameno: u64,
   ) -> i64 {
     assert_eq!(self.twopass_state, PASS_SINGLE);
-    self.select_qi(ctx, output_frameno, FRAME_SUBTYPE_I, None, 0).log_base_q
+    self
+      .select_qi::<_, BD>(ctx, output_frameno, FRAME_SUBTYPE_I, None, 0)
+      .log_base_q
   }
 
   // Initialize the first pass and emit a placeholder summary
diff --git a/src/rdo.rs b/src/rdo.rs
index c92b383b92..1dfdba438d 100644
--- a/src/rdo.rs
+++ b/src/rdo.rs
@@ -139,9 +139,13 @@ pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 {
 }
 
 #[allow(unused)]
-pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
+pub fn cdef_dist_wxh<
+  T: Pixel,
+  F: Fn(Area, BlockSize) -> DistortionScale,
+  const BD: usize,
+>(
   src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
-  bit_depth: usize, compute_bias: F, cpu: CpuFeatureLevel,
+  compute_bias: F, cpu: CpuFeatureLevel,
 ) -> Distortion {
   debug_assert!(src1.plane_cfg.xdec == 0);
   debug_assert!(src1.plane_cfg.ydec == 0);
@@ -155,12 +159,11 @@ pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
       let kernel_w = (w - x).min(8);
       let area = Area::StartingAt { x: x as isize, y: y as isize };
 
-      let value = RawDistortion(cdef_dist_kernel(
+      let value = RawDistortion(cdef_dist_kernel::<_, BD>(
         &src1.subregion(area),
         &src2.subregion(area),
         kernel_w,
         kernel_h,
-        bit_depth,
         cpu,
       ) as u64);
 
@@ -174,9 +177,13 @@ pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
 
 /// Sum of Squared Error for a wxh block
 /// Currently limited to w and h of valid blocks
-pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
+pub fn sse_wxh<
+  T: Pixel,
+  F: Fn(Area, BlockSize) -> DistortionScale,
+  const BD: usize,
+>(
   src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
-  compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel,
+  compute_bias: F, cpu: CpuFeatureLevel,
 ) -> Distortion {
   // See get_weighted_sse in src/dist.rs.
   // Provide a scale to get_weighted_sse for each square region of this size.
@@ -218,9 +225,7 @@ pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
     }
   }
 
-  Distortion(get_weighted_sse(
-    src1, src2, buf, buf_stride, w, h, bit_depth, cpu,
-  ))
+  Distortion(get_weighted_sse(src1, src2, buf, buf_stride, w, h, cpu))
 }
 
 pub const fn clip_visible_bsize(
@@ -249,7 +254,7 @@ pub const fn clip_visible_bsize(
 }
 
 // Compute the pixel-domain distortion for an encode
-fn compute_distortion<T: Pixel>(
+fn compute_distortion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
   is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool,
 ) -> ScaledDistortion {
@@ -272,12 +277,11 @@ fn compute_distortion<T: Pixel>(
   }
 
   let mut distortion = match fi.config.tune {
-    Tune::Psychovisual => cdef_dist_wxh(
+    Tune::Psychovisual => cdef_dist_wxh::<_, _, BD>(
       &input_region,
       &rec_region,
       visible_w,
       visible_h,
-      fi.sequence.bit_depth,
       |bias_area, bsize| {
         distortion_scale(
           fi,
@@ -287,7 +291,7 @@ fn compute_distortion<T: Pixel>(
       },
       fi.cpu_feature_level,
     ),
-    Tune::Psnr => sse_wxh(
+    Tune::Psnr => sse_wxh::<_, _, BD>(
       &input_region,
       &rec_region,
       visible_w,
@@ -299,7 +303,6 @@ fn compute_distortion<T: Pixel>(
           bsize,
         )
       },
-      fi.sequence.bit_depth,
       fi.cpu_feature_level,
     ),
   } * fi.dist_scale[0];
@@ -323,7 +326,7 @@ fn compute_distortion<T: Pixel>(
     for p in 1..3 {
       let input_region = ts.input_tile.planes[p].subregion(area);
       let rec_region = ts.rec.planes[p].subregion(area);
-      distortion += sse_wxh(
+      distortion += sse_wxh::<_, _, BD>(
         &input_region,
         &rec_region,
         chroma_w,
@@ -335,7 +338,6 @@ fn compute_distortion<T: Pixel>(
             bsize,
           )
         },
-        fi.sequence.bit_depth,
         fi.cpu_feature_level,
       ) * fi.dist_scale[p];
     }
@@ -344,7 +346,7 @@ fn compute_distortion<T: Pixel>(
 }
 
 // Compute the transform-domain distortion for an encode
-fn compute_tx_distortion<T: Pixel>(
+fn compute_tx_distortion<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
   is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion,
   skip: bool, luma_only: bool,
@@ -372,7 +374,7 @@ fn compute_tx_distortion<T: Pixel>(
   }
 
   let mut distortion = if skip {
-    sse_wxh(
+    sse_wxh::<_, _, BD>(
       &input_region,
       &rec_region,
       visible_w,
@@ -384,7 +386,6 @@ fn compute_tx_distortion<T: Pixel>(
           bsize,
         )
       },
-      fi.sequence.bit_depth,
       fi.cpu_feature_level,
     ) * fi.dist_scale[0]
   } else {
@@ -411,7 +412,7 @@ fn compute_tx_distortion<T: Pixel>(
     for p in 1..3 {
       let input_region = ts.input_tile.planes[p].subregion(area);
       let rec_region = ts.rec.planes[p].subregion(area);
-      distortion += sse_wxh(
+      distortion += sse_wxh::<_, _, BD>(
         &input_region,
         &rec_region,
         chroma_w,
@@ -423,7 +424,6 @@ fn compute_tx_distortion<T: Pixel>(
             bsize,
           )
         },
-        fi.sequence.bit_depth,
         fi.cpu_feature_level,
       ) * fi.dist_scale[p];
     }
@@ -720,7 +720,7 @@ pub fn compute_rd_cost<T: Pixel>(
   fi.lambda.mul_add(rate_in_bits, distortion.0 as f64)
 }
 
-pub fn rdo_tx_size_type<T: Pixel>(
+pub fn rdo_tx_size_type<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
@@ -759,7 +759,7 @@ pub fn rdo_tx_size_type<T: Pixel>(
       if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] };
 
     // Luma plane transform type decision
-    let (tx_type, rd_cost) = rdo_tx_type_decision(
+    let (tx_type, rd_cost) = rdo_tx_type_decision::<_, BD>(
       fi,
       ts,
       cw,
@@ -810,7 +810,7 @@ const fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool {
 }
 
 #[inline]
-fn luma_chroma_mode_rdo<T: Pixel>(
+fn luma_chroma_mode_rdo<T: Pixel, const BD: usize>(
   luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize,
   tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, rdo_type: RDOType,
@@ -857,7 +857,7 @@ fn luma_chroma_mode_rdo<T: Pixel>(
     for sidx in select_segment(fi, ts, tile_bo, bsize, skip) {
       cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx);
 
-      let (tx_size, tx_type) = rdo_tx_size_type(
+      let (tx_size, tx_type) = rdo_tx_size_type::<_, BD>(
         fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip,
       );
       for &chroma_mode in mode_set_chroma.iter() {
@@ -878,7 +878,7 @@ fn luma_chroma_mode_rdo<T: Pixel>(
           luma_mode_is_intra && tx_size.block_size() != bsize;
 
         encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip);
-        let (has_coeff, tx_dist) = encode_block_post_cdef(
+        let (has_coeff, tx_dist) = encode_block_post_cdef::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -903,7 +903,7 @@ fn luma_chroma_mode_rdo<T: Pixel>(
 
         let rate = wr.tell_frac() - tell;
         let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel {
-          compute_tx_distortion(
+          compute_tx_distortion::<_, BD>(
             fi,
             ts,
             bsize,
@@ -914,7 +914,14 @@ fn luma_chroma_mode_rdo<T: Pixel>(
             false,
           )
         } else {
-          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false)
+          compute_distortion::<_, BD>(
+            fi,
+            ts,
+            bsize,
+            is_chroma_block,
+            tile_bo,
+            false,
+          )
         };
         let is_zero_dist = distortion.0 == 0;
         let rd = compute_rd_cost(fi, rate, distortion);
@@ -956,7 +963,7 @@ fn luma_chroma_mode_rdo<T: Pixel>(
 ///
 /// - If the best RD found is negative.
 ///   This should never happen and indicates a development error.
-pub fn rdo_mode_decision<T: Pixel>(
+pub fn rdo_mode_decision<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   inter_cfg: &InterConfig,
@@ -975,7 +982,7 @@ pub fn rdo_mode_decision<T: Pixel>(
   let mut best = if fi.frame_type.has_inter() {
     assert!(fi.frame_type != FrameType::KEY);
 
-    inter_frame_rdo_mode_decision(
+    inter_frame_rdo_mode_decision::<_, BD>(
       fi,
       ts,
       cw,
@@ -993,7 +1000,7 @@ pub fn rdo_mode_decision<T: Pixel>(
     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
 
   if !best.skip {
-    best = intra_frame_rdo_mode_decision(
+    best = intra_frame_rdo_mode_decision::<_, BD>(
       fi,
       ts,
       cw,
@@ -1014,7 +1021,7 @@ pub fn rdo_mode_decision<T: Pixel>(
     let mut wr = WriterCounter::new();
     let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 };
 
-    write_tx_blocks(
+    write_tx_blocks::<_, _, BD>(
       fi,
       ts,
       cw,
@@ -1034,7 +1041,9 @@ pub fn rdo_mode_decision<T: Pixel>(
     );
     cw.rollback(&cw_checkpoint);
     if fi.sequence.chroma_sampling != ChromaSampling::Cs400 {
-      if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) {
+      if let Some(cfl) =
+        rdo_cfl_alpha::<_, BD>(ts, tile_bo, bsize, best.tx_size, fi)
+      {
         let mut wr = WriterCounter::new();
         let tell = wr.tell_frac();
 
@@ -1047,7 +1056,7 @@ pub fn rdo_mode_decision<T: Pixel>(
           tile_bo,
           best.skip,
         );
-        let (has_coeff, _) = encode_block_post_cdef(
+        let (has_coeff, _) = encode_block_post_cdef::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -1073,8 +1082,14 @@ pub fn rdo_mode_decision<T: Pixel>(
         let rate = wr.tell_frac() - tell;
 
         // For CFL, tx-domain distortion is not an option.
-        let distortion =
-          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false);
+        let distortion = compute_distortion::<_, BD>(
+          fi,
+          ts,
+          bsize,
+          is_chroma_block,
+          tile_bo,
+          false,
+        );
         let rd = compute_rd_cost(fi, rate, distortion);
         if rd < best.rd_cost {
           best.rd_cost = rd;
@@ -1113,7 +1128,7 @@ pub fn rdo_mode_decision<T: Pixel>(
   }
 }
 
-fn inter_frame_rdo_mode_decision<T: Pixel>(
+fn inter_frame_rdo_mode_decision<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint,
@@ -1175,7 +1190,7 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
       pmv[1] = mv_stack[1].this_mv;
     }
 
-    let res = estimate_motion(
+    let res = estimate_motion::<_, BD>(
       fi,
       ts,
       bsize.width(),
@@ -1320,7 +1335,7 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
       let mut rec_region =
         rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
 
-      luma_mode.predict_inter(
+      luma_mode.predict_inter::<_, BD>(
         fi,
         tile_rect,
         0,
@@ -1337,12 +1352,11 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
         .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
       let plane_ref = rec_region.as_const();
 
-      let satd = get_satd(
+      let satd = get_satd::<_, BD>(
         &plane_org,
         &plane_ref,
         bsize.width(),
         bsize.height(),
-        fi.sequence.bit_depth,
         fi.cpu_feature_level,
       );
       satds.push(satd);
@@ -1361,7 +1375,7 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
     |&((luma_mode, i), mvs, _satd)| {
       let mode_set_chroma = ArrayVec::from([luma_mode]);
 
-      luma_chroma_mode_rdo(
+      luma_chroma_mode_rdo::<_, BD>(
         luma_mode,
         fi,
         bsize,
@@ -1385,7 +1399,7 @@ fn inter_frame_rdo_mode_decision<T: Pixel>(
   best
 }
 
-fn intra_frame_rdo_mode_decision<T: Pixel>(
+fn intra_frame_rdo_mode_decision<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType,
@@ -1432,7 +1446,7 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
         let rec = &ts.rec.planes[0].as_const();
         let po = tile_bo.plane_offset(rec.plane_cfg);
         // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block
-        get_intra_edges(
+        get_intra_edges::<_, BD>(
           rec,
           tile_bo,
           0,
@@ -1440,7 +1454,6 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
           bsize,
           po,
           tx_size,
-          fi.sequence.bit_depth,
           None,
           fi.sequence.enable_intra_edge_filter,
           IntraParam::None,
@@ -1466,11 +1479,10 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
         let mut rec_region =
           rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
         // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block
-        luma_mode.predict_intra(
+        luma_mode.predict_intra::<_, BD>(
           tile_rect,
           &mut rec_region,
           tx_size,
-          fi.sequence.bit_depth,
           &[0i16; 2],
           IntraParam::None,
           if luma_mode.is_directional() { ief_params } else { None },
@@ -1482,12 +1494,11 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
           .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
         let plane_ref = rec_region.as_const();
 
-        satds_all[luma_mode as usize] = get_satd(
+        satds_all[luma_mode as usize] = get_satd::<_, BD>(
           &plane_org,
           &plane_ref,
           tx_size.width(),
           tx_size.height(),
-          fi.sequence.bit_depth,
           fi.cpu_feature_level,
         );
       }
@@ -1507,7 +1518,7 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
     if is_chroma_block && luma_mode != PredictionMode::DC_PRED {
       mode_set_chroma.push(PredictionMode::DC_PRED);
     }
-    luma_chroma_mode_rdo(
+    luma_chroma_mode_rdo::<_, BD>(
       luma_mode,
       fi,
       bsize,
@@ -1541,7 +1552,7 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
     let mut best_angle_delta = best.angle_delta;
     let mut angle_delta_rdo = |y, uv| -> AngleDelta {
       if best.angle_delta.y != y || best.angle_delta.uv != uv {
-        luma_chroma_mode_rdo(
+        luma_chroma_mode_rdo::<_, BD>(
           best.pred_mode_luma,
           fi,
           bsize,
@@ -1581,7 +1592,7 @@ fn intra_frame_rdo_mode_decision<T: Pixel>(
 /// # Panics
 ///
 /// - If the block size is invalid for subsampling.
-pub fn rdo_cfl_alpha<T: Pixel>(
+pub fn rdo_cfl_alpha<T: Pixel, const BD: usize>(
   ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize,
   luma_tx_size: TxSize, fi: &FrameInvariants<T>,
 ) -> Option<CFLParams> {
@@ -1613,7 +1624,7 @@ pub fn rdo_cfl_alpha<T: Pixel>(
       let rec = &mut ts.rec.planes[p];
       let input = &ts.input_tile.planes[p];
       let po = tile_bo.plane_offset(rec.plane_cfg);
-      let edge_buf = get_intra_edges(
+      let edge_buf = get_intra_edges::<_, BD>(
         &rec.as_const(),
         tile_bo,
         0,
@@ -1621,7 +1632,6 @@ pub fn rdo_cfl_alpha<T: Pixel>(
         bsize,
         po,
         uv_tx_size,
-        fi.sequence.bit_depth,
         Some(PredictionMode::UV_CFL_PRED),
         fi.sequence.enable_intra_edge_filter,
         IntraParam::None,
@@ -1629,24 +1639,22 @@ pub fn rdo_cfl_alpha<T: Pixel>(
       let mut alpha_cost = |alpha: i16| -> u64 {
         let mut rec_region =
           rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
-        PredictionMode::UV_CFL_PRED.predict_intra(
+        PredictionMode::UV_CFL_PRED.predict_intra::<_, BD>(
           tile_rect,
           &mut rec_region,
           uv_tx_size,
-          fi.sequence.bit_depth,
           &ac.data,
           IntraParam::Alpha(alpha),
           None,
           &edge_buf,
           fi.cpu_feature_level,
         );
-        sse_wxh(
+        sse_wxh::<_, _, BD>(
           &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
           &rec_region.as_const(),
           visible_tx_w,
           visible_tx_h,
           |_, _| DistortionScale::default(), // We're not doing RDO here.
-          fi.sequence.bit_depth,
           fi.cpu_feature_level,
         )
         .0
@@ -1688,7 +1696,7 @@ pub fn rdo_cfl_alpha<T: Pixel>(
 ///   This should never happen and indicates a development error.
 /// - If the best RD found is negative.
 ///   This should never happen and indicates a development error.
-pub fn rdo_tx_type_decision<T: Pixel>(
+pub fn rdo_tx_type_decision<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>,
   mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
@@ -1726,7 +1734,7 @@ pub fn rdo_tx_type_decision<T: Pixel>(
     }
 
     if is_inter {
-      motion_compensate(
+      motion_compensate::<_, BD>(
         fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true,
       );
     }
@@ -1734,7 +1742,7 @@ pub fn rdo_tx_type_decision<T: Pixel>(
     let mut wr = WriterCounter::new();
     let tell = wr.tell_frac();
     let (_, tx_dist) = if is_inter {
-      write_tx_tree(
+      write_tx_tree::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -1751,7 +1759,7 @@ pub fn rdo_tx_type_decision<T: Pixel>(
         need_recon_pixel,
       )
     } else {
-      write_tx_blocks(
+      write_tx_blocks::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -1773,7 +1781,7 @@ pub fn rdo_tx_type_decision<T: Pixel>(
 
     let rate = wr.tell_frac() - tell;
     let distortion = if fi.use_tx_domain_distortion {
-      compute_tx_distortion(
+      compute_tx_distortion::<_, BD>(
         fi,
         ts,
         bsize,
@@ -1784,7 +1792,14 @@ pub fn rdo_tx_type_decision<T: Pixel>(
         true,
       )
     } else {
-      compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true)
+      compute_distortion::<_, BD>(
+        fi,
+        ts,
+        bsize,
+        is_chroma_block,
+        tile_bo,
+        true,
+      )
     };
     cw.rollback(cw_checkpoint.as_ref().unwrap());
 
@@ -1836,14 +1851,14 @@ pub fn get_sub_partitions(
 }
 
 #[inline(always)]
-fn rdo_partition_none<T: Pixel>(
+fn rdo_partition_none<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
   inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>,
 ) -> f64 {
   debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
 
-  let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
+  let mode = rdo_mode_decision::<_, BD>(fi, ts, cw, bsize, tile_bo, inter_cfg);
   let cost = mode.rd_cost;
 
   child_modes.push(mode);
@@ -1853,7 +1868,7 @@ fn rdo_partition_none<T: Pixel>(
 
 // VERTICAL, HORIZONTAL or simple SPLIT
 #[inline(always)]
-fn rdo_partition_simple<T: Pixel, W: Writer>(
+fn rdo_partition_simple<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig,
@@ -1895,7 +1910,7 @@ fn rdo_partition_simple<T: Pixel, W: Writer>(
 
     if has_cols && has_rows {
       let mode_decision =
-        rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg);
+        rdo_mode_decision::<_, BD>(fi, ts, cw, subsize, offset, inter_cfg);
 
       rd_cost_sum += mode_decision.rd_cost;
 
@@ -1907,7 +1922,7 @@ fn rdo_partition_simple<T: Pixel, W: Writer>(
           if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
         cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize);
       }
-      encode_block_with_modes(
+      encode_block_with_modes::<_, _, BD>(
         fi,
         ts,
         cw,
@@ -1935,7 +1950,7 @@ fn rdo_partition_simple<T: Pixel, W: Writer>(
 ///
 /// - If the best RD found is negative.
 ///   This should never happen, and indicates a development error.
-pub fn rdo_partition_decision<T: Pixel, W: Writer>(
+pub fn rdo_partition_decision<T: Pixel, W: Writer, const BD: usize>(
   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
   bsize: BlockSize, tile_bo: TileBlockOffset,
@@ -1960,7 +1975,7 @@ pub fn rdo_partition_decision<T: Pixel, W: Writer>(
 
     let cost = match partition {
       PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => {
-        Some(rdo_partition_none(
+        Some(rdo_partition_none::<_, BD>(
           fi,
           ts,
           cw,
@@ -1971,7 +1986,7 @@ pub fn rdo_partition_decision<T: Pixel, W: Writer>(
         ))
       }
       PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => {
-        rdo_partition_simple(
+        rdo_partition_simple::<_, _, BD>(
           fi,
           ts,
           cw,
@@ -2012,7 +2027,7 @@ pub fn rdo_partition_decision<T: Pixel, W: Writer>(
   }
 }
 
-fn rdo_loop_plane_error<T: Pixel>(
+fn rdo_loop_plane_error<T: Pixel, const BD: usize>(
   base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset,
   sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
   blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize,
@@ -2054,23 +2069,21 @@ fn rdo_loop_plane_error<T: Pixel>(
           // For loop filters, We intentionally use cdef_dist even with
           // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a
           // significant negative impact on other metrics and visual quality.
-          RawDistortion(cdef_dist_kernel(
+          RawDistortion(cdef_dist_kernel::<_, BD>(
             &src_region,
             &test_region,
             8,
             8,
-            fi.sequence.bit_depth,
             fi.cpu_feature_level,
           ) as u64)
             * bias
         } else {
-          sse_wxh(
+          sse_wxh::<_, _, BD>(
             &src_region,
             &test_region,
             8 >> xdec,
             8 >> ydec,
             |_, _| bias,
-            fi.sequence.bit_depth,
             fi.cpu_feature_level,
           )
         };
@@ -2088,7 +2101,7 @@ fn rdo_loop_plane_error<T: Pixel>(
 /// # Panics
 ///
 /// - If both CDEF and LRF are disabled.
-pub fn rdo_loop_decision<T: Pixel, W: Writer>(
+pub fn rdo_loop_decision<T: Pixel, W: Writer, const BD: usize>(
   base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>,
   ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W,
   deblock_p: bool,
@@ -2285,7 +2298,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
     // Find a good deblocking filter solution for the passed in area.
     // This is not RDO of deblocking itself, merely a solution to get
     // better results from CDEF/LRF RDO.
-    let deblock_levels = deblock_filter_optimize(
+    let deblock_levels = deblock_filter_optimize::<_, _, BD>(
       fi,
       &rec_subset.as_tile(),
       &src_subset,
@@ -2301,13 +2314,12 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
       deblock_copy.levels = deblock_levels;
 
       // finally, deblock the temp frame
-      deblock_filter_frame(
+      deblock_filter_frame::<_, BD>(
         &deblock_copy,
         &mut rec_subset.as_tile_mut(),
         &tileblocks_subset.as_const(),
         crop_w,
         crop_h,
-        fi.sequence.bit_depth,
         planes,
       );
     }
@@ -2335,7 +2347,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
     if cdef_work.is_some() {
       Some((
         &rec_subset,
-        cdef_analyze_superblock_range(
+        cdef_analyze_superblock_range::<_, BD>(
           fi,
           &rec_subset,
           &tileblocks_subset.as_const(),
@@ -2382,7 +2394,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
             let mut err = ScaledDistortion::zero();
             let mut rate = 0;
 
-            cdef_filter_superblock(
+            cdef_filter_superblock::<_, BD>(
               fi,
               &rec_subset,
               &mut cdef_ref.as_tile_mut(),
@@ -2418,7 +2430,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                 // We have a valid LRU, apply LRF, compute error
                 match best_lrf[lru_y * lru_w[pli] + lru_x][pli] {
                   RestorationFilter::None {} => {
-                    err += rdo_loop_plane_error(
+                    err += rdo_loop_plane_error::<_, BD>(
                       base_sbo,
                       loop_sbo,
                       1,
@@ -2459,7 +2471,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                       &cdef_ref.planes[pli].slice(loop_po),
                       &cdef_ref.planes[pli].slice(loop_po),
                     );
-                    sgrproj_stripe_filter(
+                    sgrproj_stripe_filter::<_, _, BD>(
                       set,
                       xqd,
                       fi,
@@ -2473,7 +2485,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                         height: vis_height,
                       }),
                     );
-                    err += rdo_loop_plane_error(
+                    err += rdo_loop_plane_error::<_, BD>(
                       base_sbo,
                       loop_sbo,
                       1,
@@ -2496,7 +2508,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                 }
               } else {
                 // No actual LRU here, compute error directly from CDEF output.
-                err += rdo_loop_plane_error(
+                err += rdo_loop_plane_error::<_, BD>(
                   base_sbo,
                   loop_sbo,
                   1,
@@ -2540,7 +2552,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
 
           // Keep cdef output up to date; we need it for restoration
           // both below and above (padding)
-          cdef_filter_superblock(
+          cdef_filter_superblock::<_, BD>(
             fi,
             rec_copy,
             &mut cdef_ref_tm,
@@ -2605,7 +2617,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
 
               // Check the no filter option
               {
-                let err = rdo_loop_plane_error(
+                let err = rdo_loop_plane_error::<_, BD>(
                   base_sbo,
                   loop_sbo,
                   lru_sb_w,
@@ -2660,7 +2672,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
 
               for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity)
               {
-                let (xqd0, xqd1) = sgrproj_solve(
+                let (xqd0, xqd1) = sgrproj_solve::<_, BD>(
                   set,
                   fi,
                   &ts.integral_buffer,
@@ -2673,7 +2685,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                 let current_lrf =
                   RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] };
                 if let RestorationFilter::Sgrproj { set, xqd } = current_lrf {
-                  sgrproj_stripe_filter(
+                  sgrproj_stripe_filter::<_, _, BD>(
                     set,
                     xqd,
                     fi,
@@ -2688,7 +2700,7 @@ pub fn rdo_loop_decision<T: Pixel, W: Writer>(
                     }),
                   );
                 }
-                let err = rdo_loop_plane_error(
+                let err = rdo_loop_plane_error::<_, BD>(
                   base_sbo,
                   loop_sbo,
                   lru_sb_w,
diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs
index 7414f09d3b..037ac8aadd 100644
--- a/src/scenechange/mod.rs
+++ b/src/scenechange/mod.rs
@@ -86,8 +86,6 @@ pub struct SceneChangeDetector<T: Pixel> {
   score_deque: Vec<ScenecutResult>,
   /// Number of pixels in scaled frame for fast mode
   pixels: usize,
-  /// The bit depth of the video.
-  bit_depth: usize,
   /// The CPU feature level to be used.
   cpu_feature_level: CpuFeatureLevel,
   encoder_config: EncoderConfig,
@@ -147,7 +145,6 @@ impl<T: Pixel> SceneChangeDetector<T> {
       deque_offset,
       score_deque,
       pixels,
-      bit_depth,
       cpu_feature_level,
       encoder_config,
       sequence,
@@ -165,7 +162,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
   ///
   /// This will gracefully handle the first frame in the video as well.
   #[hawktracer(analyze_next_frame)]
-  pub fn analyze_next_frame(
+  pub fn analyze_next_frame<const BD: usize>(
     &mut self, frame_set: &[&Arc<Frame<T>>], input_frameno: u64,
     previous_keyframe: u64,
   ) -> bool {
@@ -196,9 +193,13 @@ impl<T: Pixel> SceneChangeDetector<T> {
       && frame_set.len() > self.deque_offset + 1
       && self.score_deque.is_empty()
     {
-      self.initialize_score_deque(frame_set, input_frameno, self.deque_offset);
+      self.initialize_score_deque::<BD>(
+        frame_set,
+        input_frameno,
+        self.deque_offset,
+      );
     } else if self.score_deque.is_empty() {
-      self.initialize_score_deque(
+      self.initialize_score_deque::<BD>(
         frame_set,
         input_frameno,
         frame_set.len() - 1,
@@ -209,7 +210,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
     // Running single frame comparison and adding it to deque
     // Decrease deque offset if there is no new frames
     if frame_set.len() > self.deque_offset + 1 {
-      self.run_comparison(
+      self.run_comparison::<BD>(
         frame_set[self.deque_offset].clone(),
         frame_set[self.deque_offset + 1].clone(),
         input_frameno + self.deque_offset as u64,
@@ -219,7 +220,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
     }
 
     // Adaptive scenecut check
-    let (scenecut, score) = self.adaptive_scenecut();
+    let (scenecut, score) = self.adaptive_scenecut::<BD>();
     let scenecut = self.handle_min_max_intervals(distance).unwrap_or(scenecut);
     debug!(
       "[SC-Detect] Frame {}: Raw={:5.1}  ImpBl={:5.1}  Bwd={:5.1}  Fwd={:5.1}  Th={:.1}  {}",
@@ -253,12 +254,12 @@ impl<T: Pixel> SceneChangeDetector<T> {
   }
 
   // Initially fill score deque with frame scores
-  fn initialize_score_deque(
+  fn initialize_score_deque<const BD: usize>(
     &mut self, frame_set: &[&Arc<Frame<T>>], input_frameno: u64,
     init_len: usize,
   ) {
     for x in 0..init_len {
-      self.run_comparison(
+      self.run_comparison::<BD>(
         frame_set[x].clone(),
         frame_set[x + 1].clone(),
         input_frameno + x as u64,
@@ -268,14 +269,14 @@ impl<T: Pixel> SceneChangeDetector<T> {
 
   /// Runs scene change comparison beetween 2 given frames
   /// Insert result to start of score deque
-  fn run_comparison(
+  fn run_comparison<const BD: usize>(
     &mut self, frame1: Arc<Frame<T>>, frame2: Arc<Frame<T>>,
     input_frameno: u64,
   ) {
     let mut result = if self.speed_mode == SceneDetectionSpeed::Fast {
       self.fast_scenecut(frame1, frame2)
     } else {
-      self.cost_scenecut(frame1, frame2, input_frameno)
+      self.cost_scenecut::<BD>(frame1, frame2, input_frameno)
     };
 
     // Subtract the highest metric value of surrounding frames from the current one
@@ -322,7 +323,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
   /// Compares current scene score to adapted threshold based on previous scores
   /// Value of current frame is offset by lookahead, if lookahead >=5
   /// Returns true if current scene score is higher than adapted threshold
-  fn adaptive_scenecut(&mut self) -> (bool, ScenecutResult) {
+  fn adaptive_scenecut<const BD: usize>(&mut self) -> (bool, ScenecutResult) {
     let score = self.score_deque[self.deque_offset];
 
     // We use the importance block algorithm's cost metrics as a secondary algorithm
@@ -333,8 +334,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
     // the importance block algorithm is over the threshold either on this frame (hard scenecut)
     // or within the past few frames (pan). This helps filter out a few false positives
     // produced by the cost-based algorithm.
-    let imp_block_threshold =
-      IMP_BLOCK_DIFF_THRESHOLD * (self.bit_depth as f64) / 8.0;
+    let imp_block_threshold = IMP_BLOCK_DIFF_THRESHOLD * (BD as f64) / 8.0;
     if !&self.score_deque[self.deque_offset..]
       .iter()
       .any(|result| result.imp_block_cost >= imp_block_threshold)
diff --git a/src/scenechange/standard.rs b/src/scenechange/standard.rs
index 1f058271df..2f452164e9 100644
--- a/src/scenechange/standard.rs
+++ b/src/scenechange/standard.rs
@@ -18,7 +18,7 @@ impl<T: Pixel> SceneChangeDetector<T> {
   /// We gather both intra and inter costs for the frames,
   /// as well as an importance-block-based difference,
   /// and use all three metrics.
-  pub(super) fn cost_scenecut(
+  pub(super) fn cost_scenecut<const BD: usize>(
     &mut self, frame1: Arc<Frame<T>>, frame2: Arc<Frame<T>>,
     input_frameno: u64,
   ) -> ScenecutResult {
@@ -49,10 +49,9 @@ impl<T: Pixel> SceneChangeDetector<T> {
 
         let intra_costs =
           self.intra_costs.entry(input_frameno).or_insert_with(|| {
-            estimate_intra_costs(
+            estimate_intra_costs::<_, BD>(
               temp_plane,
               &*frame2,
-              self.bit_depth,
               self.cpu_feature_level,
             )
           });
@@ -67,10 +66,9 @@ impl<T: Pixel> SceneChangeDetector<T> {
         };
       });
       s.spawn(|_| {
-        mv_inter_cost = estimate_inter_costs(
+        mv_inter_cost = estimate_inter_costs::<_, BD>(
           frame2_inter_ref,
           frame1,
-          self.bit_depth,
           self.encoder_config.clone(),
           self.sequence.clone(),
           buffer,
diff --git a/src/segmentation.rs b/src/segmentation.rs
index 36ee42fb1c..776d90c265 100644
--- a/src/segmentation.rs
+++ b/src/segmentation.rs
@@ -19,7 +19,7 @@ use crate::FrameState;
 
 pub const MAX_SEGMENTS: usize = 8;
 
-pub fn segmentation_optimize<T: Pixel>(
+pub fn segmentation_optimize<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, fs: &mut FrameState<T>,
 ) {
   assert!(fi.enable_segmentation);
@@ -50,11 +50,11 @@ pub fn segmentation_optimize<T: Pixel>(
       }
       assert_ne!(min_segment, MAX_SEGMENTS);
       fs.segmentation.min_segment = min_segment as u8;
-      fs.segmentation.update_threshold(fi.base_q_idx, fi.config.bit_depth);
+      fs.segmentation.update_threshold::<BD>(fi.base_q_idx);
       return;
     }
 
-    segmentation_optimize_inner(fi, fs, offset_lower_limit);
+    segmentation_optimize_inner::<_, BD>(fi, fs, offset_lower_limit);
 
     /* Figure out parameters */
     fs.segmentation.preskip = false;
@@ -73,7 +73,7 @@ pub fn segmentation_optimize<T: Pixel>(
 }
 
 // Select target quantizers for each segment by fitting to log(scale).
-fn segmentation_optimize_inner<T: Pixel>(
+fn segmentation_optimize_inner<T: Pixel, const BD: usize>(
   fi: &FrameInvariants<T>, fs: &mut FrameState<T>, offset_lower_limit: i16,
 ) {
   use crate::quantize::{ac_q, select_ac_qi};
@@ -112,8 +112,7 @@ fn segmentation_optimize_inner<T: Pixel>(
   // See `distortion_scale_for` for more information.
   let compute_delta = |centroids: &[i16]| {
     use crate::util::{bexp64, blog64};
-    let log2_base_ac_q_q57 =
-      blog64(ac_q(fi.base_q_idx, 0, fi.config.bit_depth).get().into());
+    let log2_base_ac_q_q57 = blog64(ac_q::<BD>(fi.base_q_idx, 0).get().into());
     centroids
       .iter()
       .rev()
@@ -128,8 +127,7 @@ fn segmentation_optimize_inner<T: Pixel>(
       // and take the delta from the base quantizer index.
       .map(|q| {
         // Avoid going into lossless mode by never bringing qidx below 1.
-        select_ac_qi(q, fi.config.bit_depth).max(1) as i16
-          - fi.base_q_idx as i16
+        select_ac_qi::<BD>(q).max(1) as i16 - fi.base_q_idx as i16
       })
       .collect::<ArrayVec<_, MAX_SEGMENTS>>()
   };
@@ -155,7 +153,7 @@ fn segmentation_optimize_inner<T: Pixel>(
     data[SegLvl::SEG_LVL_ALT_Q as usize] = delta.max(offset_lower_limit);
   }
 
-  fs.segmentation.update_threshold(fi.base_q_idx, fi.config.bit_depth);
+  fs.segmentation.update_threshold::<BD>(fi.base_q_idx);
 }
 
 pub fn select_segment<T: Pixel>(
diff --git a/src/transform/forward.rs b/src/transform/forward.rs
index ac9f4e850b..d50ad8435a 100644
--- a/src/transform/forward.rs
+++ b/src/transform/forward.rs
@@ -98,9 +98,9 @@ pub mod rust {
   ///
   /// - If called with an invalid combination of `tx_size` and `tx_type`
   #[cold_for_target_arch("x86_64")]
-  pub fn forward_transform<T: Coefficient>(
+  pub fn forward_transform<T: Coefficient, const BD: usize>(
     input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-    tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
+    tx_type: TxType, _cpu: CpuFeatureLevel,
   ) {
     assert!(valid_av1_transform(tx_size, tx_type));
 
@@ -117,7 +117,7 @@ pub mod rust {
     let mut tmp: Aligned<[i32; 64 * 64]> = unsafe { Aligned::uninitialized() };
     let buf = &mut tmp.data[..txfm_size_col * txfm_size_row];
 
-    let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd);
+    let cfg = Txfm2DFlipCfg::fwd::<BD>(tx_type, tx_size);
 
     let txfm_func_col = get_func(cfg.txfm_type_col);
     let txfm_func_row = get_func(cfg.txfm_type_row);
diff --git a/src/transform/forward_shared.rs b/src/transform/forward_shared.rs
index 2c818fb89e..221b99f4d3 100644
--- a/src/transform/forward_shared.rs
+++ b/src/transform/forward_shared.rs
@@ -119,7 +119,7 @@ impl Txfm2DFlipCfg {
   /// # Panics
   ///
   /// - If called with an invalid combination of `tx_size` and `tx_type`
-  pub fn fwd(tx_type: TxType, tx_size: TxSize, bd: usize) -> Self {
+  pub fn fwd<const BD: usize>(tx_type: TxType, tx_size: TxSize) -> Self {
     let tx_type_1d_col = VTX_TAB[tx_type as usize];
     let tx_type_1d_row = HTX_TAB[tx_type as usize];
     let txw_idx = tx_size.width_index();
@@ -134,7 +134,7 @@ impl Txfm2DFlipCfg {
       tx_size,
       ud_flip,
       lr_flip,
-      shift: FWD_TXFM_SHIFT_LS[tx_size as usize][(bd - 8) / 2],
+      shift: FWD_TXFM_SHIFT_LS[tx_size as usize][(BD - 8) / 2],
       txfm_type_col,
       txfm_type_row,
     }
diff --git a/src/transform/inverse.rs b/src/transform/inverse.rs
index cfe136352b..54c0a2d575 100644
--- a/src/transform/inverse.rs
+++ b/src/transform/inverse.rs
@@ -1602,9 +1602,9 @@ pub(crate) mod rust {
   use std::cmp;
 
   #[cold_for_target_arch("x86_64", "aarch64")]
-  pub fn inverse_transform_add<T: Pixel>(
+  pub fn inverse_transform_add<T: Pixel, const BD: usize>(
     input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize,
-    tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
+    tx_size: TxSize, tx_type: TxType, _cpu: CpuFeatureLevel,
   ) {
     let width: usize = tx_size.width();
     let height: usize = tx_size.height();
@@ -1619,7 +1619,7 @@ pub(crate) mod rust {
     let tx_types_1d = get_1d_tx_types(tx_type);
 
     // perform inv txfm on every row
-    let range = bd + 8;
+    let range = BD + 8;
     let txfm_fn = INV_TXFM_FNS[tx_types_1d.1 as usize][ILog::ilog(width) - 3];
     // 64 point transforms only signal 32 coeffs. We only take chunks of 32
     //   and skip over the last 32 transforms here.
@@ -1645,7 +1645,7 @@ pub(crate) mod rust {
     }
 
     // perform inv txfm on every col
-    let range = cmp::max(bd + 6, 16);
+    let range = cmp::max(BD + 6, 16);
     let txfm_fn = INV_TXFM_FNS[tx_types_1d.0 as usize][ILog::ilog(height) - 3];
     for c in 0..width {
       let mut temp_in: [i32; 64] = [0; 64];
@@ -1664,7 +1664,7 @@ pub(crate) mod rust {
         .zip(output.rows_iter_mut().map(|row| &mut row[c]).take(height))
       {
         let v: i32 = (*out).as_();
-        let v = clamp(v + round_shift(*temp, 4), 0, (1 << bd) - 1);
+        let v = clamp(v + round_shift(*temp, 4), 0, (1 << BD) - 1);
         *out = T::cast_from(v);
       }
     }
diff --git a/src/transform/mod.rs b/src/transform/mod.rs
index d14913e133..d05dc6dfe5 100644
--- a/src/transform/mod.rs
+++ b/src/transform/mod.rs
@@ -450,7 +450,7 @@ mod test {
   use crate::frame::*;
   use rand::random;
 
-  fn test_roundtrip<T: Pixel>(
+  fn test_roundtrip<T: Pixel, const BD: usize>(
     tx_size: TxSize, tx_type: TxType, tolerance: i16,
   ) {
     let cpu = CpuFeatureLevel::default();
@@ -474,14 +474,20 @@ mod test {
       *d = T::cast_from(random::<u8>());
       *r = i16::cast_from(*s) - i16::cast_from(*d);
     }
-    forward_transform(res, freq, tx_size.width(), tx_size, tx_type, 8, cpu);
-    inverse_transform_add(
+    forward_transform::<_, BD>(
+      res,
+      freq,
+      tx_size.width(),
+      tx_size,
+      tx_type,
+      cpu,
+    );
+    inverse_transform_add::<_, BD>(
       freq,
       &mut dst.as_region_mut(),
       coeff_area,
       tx_size,
       tx_type,
-      8,
       cpu,
     );
 
@@ -526,7 +532,7 @@ mod test {
     }
   }
 
-  fn roundtrips<T: Pixel>() {
+  fn roundtrips<T: Pixel, const BD: usize>() {
     let combinations = [
       (TX_4X4, DCT_DCT, 0),
       (TX_4X4, ADST_DCT, 0),
@@ -577,17 +583,17 @@ mod test {
     ];
     for &(tx_size, tx_type, tolerance) in combinations.iter() {
       println!("Testing combination {:?}, {:?}", tx_size, tx_type);
-      test_roundtrip::<T>(tx_size, tx_type, tolerance);
+      test_roundtrip::<T, BD>(tx_size, tx_type, tolerance);
     }
   }
 
   #[test]
   fn roundtrips_u8() {
-    roundtrips::<u8>();
+    roundtrips::<u8, 8>();
   }
 
   #[test]
   fn roundtrips_u16() {
-    roundtrips::<u16>();
+    roundtrips::<u16, 10>();
   }
 }