From 38f98c8c997549f3d3461a3f44e35f4510972ba7 Mon Sep 17 00:00:00 2001 From: Zen Date: Fri, 29 Jan 2021 10:52:12 +0200 Subject: [PATCH 1/5] add required type_ascription --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index ad4899b380..d7d76487e3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -57,6 +57,7 @@ #![warn(clippy::needless_continue)] #![warn(clippy::path_buf_push_overwrite)] #![warn(clippy::range_plus_one)] +#![feature(type_ascription)] // Override assert! and assert_eq! in tests #[cfg(test)] From 2e10e9b50fa65d8ed986eb2d6dbbaf3a63ca6b32 Mon Sep 17 00:00:00 2001 From: Zen Date: Fri, 29 Jan 2021 10:52:21 +0200 Subject: [PATCH 2/5] wip parallelization for cdef --- src/cdef.rs | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/src/cdef.rs b/src/cdef.rs index d65bb78e60..5f2137c5c1 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -13,6 +13,7 @@ use crate::encoder::FrameInvariants; use crate::frame::*; use crate::tiling::*; use crate::util::{clamp, msb, CastFromPrimitive, Pixel}; +use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use rust_hawktracer::*; use crate::cpu_features::CpuFeatureLevel; @@ -600,19 +601,29 @@ pub fn cdef_filter_tile( let fb_height = (output.planes[0].rect().height + 63) / 64; // should parallelize this + let mut queue: Vec<(usize, usize)> = Vec::new(); + for fby in 0..fb_height { for fbx in 0..fb_width { - // tile_sbo is treated as an offset into the Tiles' plane - // regions, not as an absolute offset in the visible frame. The - // Tile's own offset is added to this in order to address into - // the input Frame. 
- let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: fbx, y: fby }); - let cdef_index = tb.get_cdef(tile_sbo); - let cdef_dirs = cdef_analyze_superblock(fi, input, tb, tile_sbo); - - cdef_filter_superblock( - fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs, - ); - } - } + queue.push((fbx, fby)); + }} + + queue.par_iter() + .for_each(|tpl| filter_tile( tpl: &(usize, usize), fi: &FrameInvariants, input: &Frame, tb: &TileBlocks, output: &mut TileMut<'_, T>)); } + +#[hawktracer(filter_tile)] +pub fn filter_tile( + tpl: &(usize, usize), fi: &FrameInvariants, input: &Frame, + tb: &TileBlocks, output: &mut TileMut<'_, T>) { + // tile_sbo is treated as an offset into the Tiles' plane + // regions, not as an absolute offset in the visible frame. The + // Tile's own offset is added to this in order to address into + // the input Frame. + let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: tpl.0, y: tpl.1 }); + let cdef_index = tb.get_cdef(tile_sbo); + let cdef_dirs = cdef_analyze_superblock(fi, input, tb, tile_sbo); + cdef_filter_superblock( + fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs, + ); +} \ No newline at end of file From a638bd330574a5f9a37693f3d556d3e160696dbc Mon Sep 17 00:00:00 2001 From: Zen Date: Fri, 29 Jan 2021 13:43:19 +0200 Subject: [PATCH 3/5] remove ascription --- src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index d7d76487e3..ad4899b380 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -57,7 +57,6 @@ #![warn(clippy::needless_continue)] #![warn(clippy::path_buf_push_overwrite)] #![warn(clippy::range_plus_one)] -#![feature(type_ascription)] // Override assert! and assert_eq! 
in tests #[cfg(test)] From 7bb810ff65b7a47e984c0662b621a6b11f5420c5 Mon Sep 17 00:00:00 2001 From: Zen Date: Fri, 29 Jan 2021 14:12:17 +0200 Subject: [PATCH 4/5] wip, no unnecessary notation --- src/cdef.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/cdef.rs b/src/cdef.rs index 5f2137c5c1..03dca7b83a 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -601,21 +601,22 @@ pub fn cdef_filter_tile( let fb_height = (output.planes[0].rect().height + 63) / 64; // should parallelize this - let mut queue: Vec<(usize, usize)> = Vec::new(); + let mut queue: Vec<(usize, usize, TileMut<'_, T>)> = Vec::new(); for fby in 0..fb_height { for fbx in 0..fb_width { - queue.push((fbx, fby)); - }} + queue.push((fbx, fby, output)); + } + } - queue.par_iter() - .for_each(|tpl| filter_tile( tpl: &(usize, usize), fi: &FrameInvariants, input: &Frame, tb: &TileBlocks, output: &mut TileMut<'_, T>)); + queue.par_iter().for_each(|tpl| filter_tile(tpl, fi, input, tb, output)); } #[hawktracer(filter_tile)] pub fn filter_tile( - tpl: &(usize, usize), fi: &FrameInvariants, input: &Frame, - tb: &TileBlocks, output: &mut TileMut<'_, T>) { + tpl: &(usize, usize, &mut TileMut<'_, T>), fi: &FrameInvariants, + input: &Frame, tb: &TileBlocks, output: &mut TileMut<'_, T>, +) { // tile_sbo is treated as an offset into the Tiles' plane // regions, not as an absolute offset in the visible frame. 
The // Tile's own offset is added to this in order to address into @@ -624,6 +625,6 @@ pub fn filter_tile( let cdef_index = tb.get_cdef(tile_sbo); let cdef_dirs = cdef_analyze_superblock(fi, input, tb, tile_sbo); cdef_filter_superblock( - fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs, + fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs, ); -} \ No newline at end of file +} From 54bddecd01d5b7ae3b05c7f437bce400a9d82167 Mon Sep 17 00:00:00 2001 From: Zen Date: Thu, 4 Feb 2021 21:47:36 +0200 Subject: [PATCH 5/5] CDEF parallelization wip/mvp --- src/cdef.rs | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/src/cdef.rs b/src/cdef.rs index 03dca7b83a..823e701c18 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -13,11 +13,16 @@ use crate::encoder::FrameInvariants; use crate::frame::*; use crate::tiling::*; use crate::util::{clamp, msb, CastFromPrimitive, Pixel}; -use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; +use rayon::iter::ParallelIterator; +use rayon::prelude::*; use rust_hawktracer::*; use crate::cpu_features::CpuFeatureLevel; -use std::cmp; +use std::{ + cmp, + ops::DerefMut, + sync::{Arc, Mutex}, +}; cfg_if::cfg_if! 
{ if #[cfg(nasm_x86_64)] { @@ -600,31 +605,46 @@ pub fn cdef_filter_tile( let fb_width = (output.planes[0].rect().width + 63) / 64; let fb_height = (output.planes[0].rect().height + 63) / 64; - // should parallelize this - let mut queue: Vec<(usize, usize, TileMut<'_, T>)> = Vec::new(); + let mut queue: Vec<(usize, usize, Arc>>)> = + Vec::new(); + let shared_output = Arc::new(Mutex::new(output)); for fby in 0..fb_height { for fbx in 0..fb_width { - queue.push((fbx, fby, output)); + queue.push((fbx, fby, shared_output.clone())); } } - queue.par_iter().for_each(|tpl| filter_tile(tpl, fi, input, tb, output)); + queue.into_par_iter().for_each(|tpl| filter_tile(tpl, fi, input, tb)); } #[hawktracer(filter_tile)] pub fn filter_tile( - tpl: &(usize, usize, &mut TileMut<'_, T>), fi: &FrameInvariants, - input: &Frame, tb: &TileBlocks, output: &mut TileMut<'_, T>, + tpl: (usize, usize, Arc>>), + fi: &FrameInvariants, input: &Frame, tb: &TileBlocks, ) { // tile_sbo is treated as an offset into the Tiles' plane // regions, not as an absolute offset in the visible frame. The // Tile's own offset is added to this in order to address into // the input Frame. - let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: tpl.0, y: tpl.1 }); + let (fbx, fby, shared_output) = tpl; + let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: fbx, y: fby }); let cdef_index = tb.get_cdef(tile_sbo); let cdef_dirs = cdef_analyze_superblock(fi, input, tb, tile_sbo); + loop { + if shared_output.try_lock().is_ok() { + break; + } + } + let mut output = shared_output.lock().unwrap(); + cdef_filter_superblock( - fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs, + fi, + input, + output.deref_mut(), + tb, + tile_sbo, + cdef_index, + &cdef_dirs, ); }