[video_core/host_shaders] unroll lanczos loop for slightly better perf (#3754)

Some drivers (Mali in particular) appear reluctant to unroll loops with more than 7 constant iterations (?); hence manual unrolling is potentially beneficial, since it avoids extra branching and gives more uniform runtime expectations.

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3754
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Reviewed-by: crueter <crueter@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
2026-04-10 03:18:55 +02:00 · 2026-04-06 19:14:13 +02:00 · 2026-04-06 19:14:13 +02:00 · 876884e783
commit 876884e783
parent 028050cf04
1 changed files with 18 additions and 8 deletions
--- a/src/video_core/host_shaders/present_lanczos.frag
+++ b/src/video_core/host_shaders/present_lanczos.frag
@@ -24,13 +24,23 @@ vec4 textureLanczos(sampler2D textureSampler, vec2 p) {
    vec2 cc = floor(p * res) / res;
    // kernel size = (2r + 1)^2
    const int r = 3; //radius (1 = 3 steps)
-    for (int x = -r; x <= r; x++)
+#define LANCZOS_LOOP_STEP(x, y) \
-        for (int y = -r; y <= r; y++) {
+    { \
-            vec2 kp = 0.5f * (vec2(x, y) / res); // 0.5 = half-pixel level resampling
+        vec2 kp = 0.5f * (vec2(x, y) / res); /* 0.5 = half-pixel level resampling */ \
-            vec2 uv = cc + kp;
+        vec2 uv = cc + kp; \
-            float w = lanczos(kp, float(r));
+        float w = lanczos(kp, float(r)); \
-            c_sum += w * texture(textureSampler, p + kp).rgb;
+        c_sum += w * texture(textureSampler, p + kp).rgb; \
-            w_sum += w;
+        w_sum += w; \
    }
    for (int y = -r; y <= r; ++y) {
        LANCZOS_LOOP_STEP(-3, y);
        LANCZOS_LOOP_STEP(-2, y);
        LANCZOS_LOOP_STEP(-1, y);
        LANCZOS_LOOP_STEP(-0, y);
        LANCZOS_LOOP_STEP(+1, y);
        LANCZOS_LOOP_STEP(+2, y);
        LANCZOS_LOOP_STEP(+3, y);
    }
    return vec4(c_sum / w_sum, 1.0f);
 }