Skip to content

Commit 68b119f

Browse files
committed
Optimize sgrproj_solve to avoid f64 arithmetic
The accumulated results always fit in an i64, regardless of the bit depth used. Not much auto-vectorization seems to be happening, but the integer version is still faster. Some quick tests suggest that switching to a smaller integer type enables auto-vectorization.
1 parent 664fa46 commit 68b119f

File tree

1 file changed

+52
-10
lines changed

1 file changed

+52
-10
lines changed

src/lrf.rs

+52-10
Original file line numberDiff line numberDiff line change
@@ -929,17 +929,59 @@ pub fn sgrproj_solve<T: Pixel>(
929929
} else {
930930
sgrproj_box_f_r0(&mut f_r1, y, cdef_w, &cdeffed, fi.cpu_feature_level);
931931
}
932-
for x in 0..cdef_w {
933-
let u = i32::cast_from(cdeffed.p(x, y)) << SGRPROJ_RST_BITS;
934-
let s = (i32::cast_from(input.p(x, y)) << SGRPROJ_RST_BITS) - u;
935-
let f2 = f_r2_01[dy][x] as i32 - u;
936-
let f1 = f_r1[x] as i32 - u;
937-
h[0][0] += f2 as f64 * f2 as f64;
938-
h[1][1] += f1 as f64 * f1 as f64;
939-
h[0][1] += f1 as f64 * f2 as f64;
940-
c[0] += f2 as f64 * s as f64;
941-
c[1] += f1 as f64 * s as f64;
932+
933+
#[inline(always)]
934+
fn process_line<T: Pixel>(
935+
h: &mut [[f64; 2]; 2], c: &mut [f64; 2], cdeffed: &[T], input: &[T],
936+
f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize,
937+
) {
938+
let cdeffed_it = cdeffed[..cdef_w].iter();
939+
let input_it = input[..cdef_w].iter();
940+
let f_r2_ab_it = f_r2_ab[..cdef_w].iter();
941+
let f_r1_it = f_r1[..cdef_w].iter();
942+
943+
#[derive(Debug, Copy, Clone)]
944+
struct Sums {
945+
h: [[i64; 2]; 2],
946+
c: [i64; 2],
947+
}
948+
949+
let sums: Sums = cdeffed_it
950+
.zip(input_it)
951+
.zip(f_r2_ab_it.zip(f_r1_it))
952+
.map(|((&u, &i), (&f2, &f1))| {
953+
let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
954+
let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u;
955+
let f2 = f2 as i32 - u;
956+
let f1 = f1 as i32 - u;
957+
(s as i64, f1 as i64, f2 as i64)
958+
})
959+
.fold(Sums { h: [[0; 2]; 2], c: [0; 2] }, |sums, (s, f1, f2)| {
960+
let mut ret: Sums = sums;
961+
ret.h[0][0] += f2 * f2;
962+
ret.h[1][1] += f1 * f1;
963+
ret.h[0][1] += f1 * f2;
964+
ret.c[0] += f2 * s;
965+
ret.c[1] += f1 * s;
966+
ret
967+
});
968+
969+
h[0][0] += sums.h[0][0] as f64;
970+
h[1][1] += sums.h[1][1] as f64;
971+
h[0][1] += sums.h[0][1] as f64;
972+
c[0] += sums.c[0] as f64;
973+
c[1] += sums.c[1] as f64;
942974
}
975+
976+
process_line(
977+
&mut h,
978+
&mut c,
979+
&cdeffed[y],
980+
&input[y],
981+
&f_r1,
982+
&f_r2_01[dy],
983+
cdef_w,
984+
);
943985
}
944986
}
945987

0 commit comments

Comments
 (0)