// Inverse analytical texture mapping (ATM)

#version 430

// #pragma optimize(off) // did not solve shader compilation error: C9999: out of memory - internal malloc failed
// Tested: On a RTX3090 machine all shader code can successfully be loaded and run.

// !!! Keep the defines below in sync with the same-named ones in atmDemo.cpp
// Disable in shaders specific filters to save code size:
#define REDUCE_CUBIC_B1C0_CODE_SIZE     0	// B-spline
#define REDUCE_CUBIC_B03C03_CODE_SIZE   0 	// Mitchell
#define REDUCE_CUBIC_B05C05_CODE_SIZE   0	// Middle
#define REDUCE_CUBIC_B0C05_CODE_SIZE    0	// Catmull-Rom
#define REDUCE_CUBIC_B0C1_CODE_SIZE     0 	// Strong
#define REDUCE_CUBIC_GENERIC_CODE_SIZE  1	// Generic B and C

#define ENABLE_VERTICAL_EDGE_HANDLING 	1	// Enable 'transpose' filter computation at near-vertical texel-quad edges

#define ENABLE_u_reconstructionFilter 0 // can only be enabled with reduced amount of shaders loaded
                                        // Otherwise: fatal error C9999: out of memory - internal malloc failed
                                        
#define ENABLE_u_showAccumulatedWeights 0 // debug

// #version 450 would be needed for fwidthFine, otherwise:
//     0(197) : error C7532: global function fwidthFine requires "#version 450" or later
//     0(197) : error C0000: ... or #extension GL_ARB_derivative_control : enable
// Instead of using fwidthFine we use fwidth (available since GLSL 1.10). We could also avoid fwidth
// completely and determine a more exact footprint, potentially gaining some efficiency.

// Since the 'Material' block variables are overwritten by every call to renderSentence of VSFontLib,
// we will avoid using it here (and elsewhere outside the font shader; it will only complicate things).

// Matrices block filled by VSMathLib
layout (std140) uniform Matrices {
	mat4 u_projViewModelMatrix;
	mat4 u_viewModelMatrix;
    mat3 u_viewMatrix;
	mat3 u_normalMatrix;
};

uniform sampler2D s_texUnit;
uniform int u_filterShape;          // enum atmFilterShape { none = 0, box, tent, cubic_... };
uniform bool u_showAccumulatedWeights;
uniform int u_markOnMagnification;  // 0: off, 1: highlight [u|v]magnified area 2: highlight borders of integer-[u|v]scale transition.
uniform int u_reconstructionFilter; // 0: box, 1: bi-linear interpolation: 2: xD-linear interpolation
//uniform int u_debugFlags[10];

in vec2 v_texCoord;
// layout(pixel_center_integer) in vec4 gl_FragCoord; // puts "pixel centers" at integer coords iso of half-pixel coords.

flat in mat4 mTexGridToVpHom_frag;

layout (location = 0) out vec4 outputAtmInvTexture;

// Functions implemented in the respective atmInvTexture_*.frag shader files.
float Hygx_tent(float xIn, float t, float y0);
float Hygx_cubic_B1C0(float xIn, float t, float y0);
float Hygx_cubic_B03C03(float xIn, float t, float y0);
float Hygx_cubic_B05C05(float xIn, float t, float y0);
float Hygx_cubic_B0C05(float xIn, float t, float y0);
float Hygx_cubic_B0C1(float xIn, float t, float y0);
float Hygx_cubicBC(float xIn_f, float t_f, float y0_f);

// These defines should be kept in line with the atmFilterShape enum in the C++ code
#define BOX             1
#define TENT            2
#define CUBIC_B1C0      3
#define CUBIC_B05C05    4
#define CUBIC_B03C03    5
#define CUBIC_B0C05     6
#define CUBIC_B0C1      7
#define CUBIC_BC        8

// Helper macros. NOTE: every macro below may evaluate its argument(s) more than once,
// so only pass side-effect-free expressions.
// CLIP: limits x to the closed range [minv, maxv].
#define CLIP(x, minv, maxv) ( ((x) <= (minv)) ? (minv) : (((x) >= (maxv)) ? (maxv) : (x)) )
// SIGN: +1 for x >= 0 and -1 for x < 0 (GLSL sign(0) gives 0 i.s.o 1).
// The argument is parenthesized so that compound expressions expand with the intended precedence.
#define SIGN(x)           ( int((x) >= 0) - int((x) < 0) )
// Integer powers by repeated multiplication.
#define POW2(x)	((x)*(x))
#define POW3(x)	((x)*(x)*(x))
#define POW4(x)	((x)*(x)*(x)*(x))

// Builds the 2x2 matrix that scales normalized texture coordinates to texel-grid coordinates.
//   texSize: texture dimensions in texels (width, height).
// Returns a diagonal (scale-only) matrix with texSize.x and texSize.y on the diagonal.
mat2 getNormalizedToTexGrid(ivec2 texSize)
{
    // Column-major constructor; the off-diagonal terms stay zero.
    return mat2(float(texSize.x), 0.0,
                0.0,              float(texSize.y));
}

#if ENABLE_u_reconstructionFilter
    // Builds the 2x2 matrix that scales texel-grid coordinates back to normalized texture coordinates.
    //   texSize: texture dimensions in texels (width, height).
    // Returns a diagonal (scale-only) matrix with 1/texSize.x and 1/texSize.y on the diagonal.
    mat2 getTexGridToNormalized(ivec2 texSize)
    {
        float invWidth  = 1.0f / float(texSize.x);
        float invHeight = 1.0f / float(texSize.y);
        // Column-major constructor; the off-diagonal terms stay zero.
        return mat2(invWidth, 0.0,
                    0.0,      invHeight);
    }
#endif 

#pragma region BOX filter functions
// Box-filter weight of the line piece y = tangent*x + y0 between x1 and x2.
// Closed-form (optimized) equivalent of Hygx_box(x2, tangent, y0) - Hygx_box(x1, tangent, y0).
//   x1, x2 : begin and end abscissa of the line piece
//   tangent: slope of the line
//   y0     : value of the line at x = 0
float HAngDiff_box(float x1, float x2, float tangent, float y0) {
    float span = x2 - x1;
    float slopeTerm = 0.5*tangent * (x2*x2 - x1*x1);
    return slopeTerm + y0*span + 0.5*span;
}
// Special case of HAngDiff_box for a horizontal line piece (tangent == 0) at height y0,
// running from x1 to x2.
float HAng0Diff_box(float x1, float x2, float y0) {
    float span = x2 - x1;
    return y0*span + 0.5*span;
}
#pragma endregion

#if !ENABLE_VERTICAL_EDGE_HANDLING

// prototypes (forward "declaration")
float WtEdgeAB_direct(vec2 a, vec2 b);

// Texel weight for edge A-B. With vertical-edge handling compiled out, every edge is
// evaluated with the direct formulation.
float WtEdgeAB(vec2 a, vec2 b) 
{
	float edgeWeight = WtEdgeAB_direct(a, b);
	return edgeWeight;
}

#else //ENABLE_VERTICAL_EDGE_HANDLING

// prototypes (forward "declaration")
float WtEdgeAB_direct(vec2 a, vec2 b);
float WtEdgeAB_transposed(vec2 a, vec2 b);
float WtHorEdgeAB(float xa, float xb, float yHorizontal);

// Box-filter weight of a horizontal line piece at height y0 running from x1 to x2.
float HyxDiff_box(float x1, float x2, float y0)
{
    float span = x2 - x1;
    return y0*span + 0.5f*span;
}
float Hyx_tent(float x, float y);
float Hyx_cubicB1C0(float x, float y);
float Hyx_cubicB03C03(float x, float y);
float Hyx_cubicB05C05(float x, float y);
float Hyx_cubicB0C05(float x, float y);
float Hyx_cubicB0C1(float x, float y);
float Hyx_cubicBC(float x, float y);

// Determines the texel weight for edge A-B, choosing the evaluation mode by edge steepness.
//   a, b: edge end points.
// Edges with |dy| < 4*|dx| are evaluated directly; steeper (near-vertical) edges are
// evaluated through the transposed formulation.
float WtEdgeAB(vec2 a, vec2 b)
{
	vec2 extent = abs(b - a); // (|dx|, |dy|)
	// For the CPU *double* precision version, direct mode is fine up to max tangent 100 (89.4 deg).
	// For this GPU *float* version max tangent 4 (76 deg) is needed for CubicStrong.
	// Switching to transposed at tangent>4 gives a max diff value of 1 (on range 256) compared
	// to switching at tangent>1, which can be explained by rounding errors.
	if (extent.y < 4 * extent.x) {
		return WtEdgeAB_direct(a, b);
	}
	return WtEdgeAB_transposed(a, b);
}

// Texel weight for a near-vertical edge A-B, obtained from the transposed (x<->y swapped) edge.
// See section "4.3 Near-vertical quad edges" of the paper.
//   a, b: edge end points.
// Returns C - D + B, where
//   D = direct weight of the transposed edge,
//   C = horizontal-edge weight in the transposed frame,
//   B = horizontal-edge weight in the original frame through min(a.y, b.y).
float WtEdgeAB_transposed(vec2 a, vec2 b)
{
	// Note (2024-05-13) without REDUCE_CUBIC_GENERIC_CODE_SIZE compilation of shader code gives:
	// (0) : fatal error C9999: out of memory - internal malloc failed

	// End points with x and y swapped
	vec2 aSwapped = a.yx;
	vec2 bSwapped = b.yx;

	// D: weight of the transposed edge
	float termD = WtEdgeAB_direct(aSwapped, bSwapped);

	// The sign of the edge slope selects which transposed end point supplies the height
	// of the horizontal edge used for C.
	float slopeSign = float(SIGN((b.y - a.y) * (b.x - a.x)));
	bool takeA = (aSwapped.y * slopeSign) > (bSwapped.y * slopeSign);
	float heightC = takeA ? aSwapped.y : bSwapped.y;
	float termC = WtHorEdgeAB(aSwapped.x, bSwapped.x, heightC);

	// B: horizontal edge in the original frame through the lower end point
	float termB = WtHorEdgeAB(a.x, b.x, min(a.y, b.y));

	return termC - termD + termB;
}

// Determines weight for exact horizontal texel-Edge A-B - ROBUST against xa==xb
//fltAtm WtHorEdgeAB(kernelType kernel, fltAtm xa, fltAtm xb, fltAtm yHorizontal)
//   xa, xb      : x-coordinates of the edge end points
//   yHorizontal : y-coordinate of the horizontal edge
// Returns the signed weight: negative sign when xb >= xa, positive when xa > xb.
// Returns 0 when the edge lies below the filter footprint, and also when u_filterShape
// selects no filter that is compiled into this shader.
float WtHorEdgeAB(float xa, float xb, float yHorizontal)
{
    // We assume CCW triangles: When xb>xa then triangle-bottom edge (neg. area) otherwise triangle-top edge
    float hfw = 2.; // Half Filter Width; initialize for cubic width
    // if not cubic, then adapt:
    if (u_filterShape == BOX)            hfw = 0.5;
    if (u_filterShape == TENT)           hfw = 1.;
	
    // Clipping of the pure horizontal edge
    if (yHorizontal < -hfw) return 0.0;
    if (yHorizontal > +hfw) yHorizontal = +hfw;
    xa = CLIP(xa, -hfw, +hfw);
    xb = CLIP(xb, -hfw, +hfw);

    // below code can only handle xb>xa, so swap vertices if they are not:
	float signAB = -1;
    if (xa > xb) {
        float xTmp = xb; xb = xa; xa = xTmp;
        signAB = +1;
    }

    // Start from zero: when no branch below matches (filter shape 'none', or a cubic variant
    // removed by its REDUCE_*_CODE_SIZE define) the result must not be an undefined value.
    float edgeWeight = 0.0;
	if (u_filterShape == BOX) {
		edgeWeight = HyxDiff_box(xa, xb, yHorizontal);
	}	
	else if (u_filterShape == TENT) {
		float Wa = Hyx_tent(xa, yHorizontal);
        float Wb = Hyx_tent(xb, yHorizontal);
        edgeWeight = Wb - Wa;
	}
	#if !REDUCE_CUBIC_B1C0_CODE_SIZE
	else if (u_filterShape == CUBIC_B1C0) {
		float Wa = Hyx_cubicB1C0(xa, yHorizontal);
		float Wb = Hyx_cubicB1C0(xb, yHorizontal);
		edgeWeight = Wb - Wa;
	}
	#endif
	#if !REDUCE_CUBIC_B03C03_CODE_SIZE
	else if (u_filterShape == CUBIC_B03C03) {
		float Wa = Hyx_cubicB03C03(xa, yHorizontal);
		float Wb = Hyx_cubicB03C03(xb, yHorizontal);
		edgeWeight = Wb - Wa;
	}
	#endif
	#if !REDUCE_CUBIC_B05C05_CODE_SIZE
	else if (u_filterShape == CUBIC_B05C05) {
		float Wa = Hyx_cubicB05C05(xa, yHorizontal);
		float Wb = Hyx_cubicB05C05(xb, yHorizontal);
		edgeWeight = Wb - Wa;
	}
	#endif
	#if !REDUCE_CUBIC_B0C05_CODE_SIZE
	else if (u_filterShape == CUBIC_B0C05) {
		float Wa = Hyx_cubicB0C05(xa, yHorizontal);
		float Wb = Hyx_cubicB0C05(xb, yHorizontal);
		edgeWeight = Wb - Wa;
	}
	#endif
	#if !REDUCE_CUBIC_B0C1_CODE_SIZE
	else if (u_filterShape == CUBIC_B0C1) {
		float Wa = Hyx_cubicB0C1(xa, yHorizontal);
		float Wb = Hyx_cubicB0C1(xb, yHorizontal);
		edgeWeight = Wb - Wa;
	}
	#endif
	#if !REDUCE_CUBIC_GENERIC_CODE_SIZE
	else if (u_filterShape == CUBIC_BC) {
		float Wa = Hyx_cubicBC(xa, yHorizontal);
		float Wb = Hyx_cubicBC(xb, yHorizontal);
		edgeWeight = Wb - Wa;
	}
	#endif
	
    return signAB * edgeWeight;
}

#endif //ENABLE_VERTICAL_EDGE_HANDLING

#if !ENABLE_VERTICAL_EDGE_HANDLING
// WtEdgeAB_direct() calls the Hyx_* helpers in every configuration, but in this file their
// prototypes are otherwise only declared inside the ENABLE_VERTICAL_EDGE_HANDLING section.
// Declare them here as well so the shader still compiles with vertical-edge handling disabled.
float Hyx_tent(float x, float y);
float Hyx_cubicB1C0(float x, float y);
float Hyx_cubicB03C03(float x, float y);
float Hyx_cubicB05C05(float x, float y);
float Hyx_cubicB0C05(float x, float y);
float Hyx_cubicB0C1(float x, float y);
float Hyx_cubicBC(float x, float y);
#endif

// Determines the texel weight for edge A-B with the direct (non-transposed) formulation.
//   a, b: edge end points.
// The edge is clipped against the prefilter footprint [-hfw, +hfw] and split into up to three
// pieces x1---x2---x3---x4: a horizontal piece, the diagonal piece and another horizontal piece.
// The weights of the three pieces are summed and signed by the edge direction
// (negative when xb >= xa, positive when xa > xb).
// Returns 0 for a zero-width edge (xa == xb) and when u_filterShape selects no filter that is
// compiled into this shader.
float WtEdgeAB_direct(vec2 a, vec2 b)
{
    float xa = a.x; float ya = a.y; // aliasses
    float xb = b.x; float yb = b.y;

    // We assume CCW triangles: When xb>xa then triangle-bottom edge (neg. area) otherwise triangle-top edge
    
    float hfw = 2.; // Half Filter Width; initialize for cubic width
    // if not cubic, then adapt:
    if (u_filterShape == BOX)            hfw = 0.5;
    if (u_filterShape == TENT)           hfw = 1.;

    // below code can only handle xb>xa, so swap vertices if they are not:
    float signAB = -1;
    if (xa > xb)
    {
        float xTmp = xb; xb = xa; xa = xTmp;
        float yTmp = yb; yb = ya; ya = yTmp;
        signAB = +1;
    }

    // The tangent and y0 (y at x=0) can be derived from line-piece ab
    float dx = xb - xa;
    float dy = yb - ya;

    // A zero-width edge (exactly vertical, or both end points equal) spans no x-range, so all
    // three clipped pieces have zero length and the weight is 0. Returning here avoids the
    // infinite/NaN tangent of dy / 0 propagating NaN into the accumulated color.
    if (dx == 0.0) return 0.0;
    
	#if 0   // For debugging: enable limit tangent; otherwise overflow in later computations
            // FIXED by transposed filtering
		// --- limit tangent for BOX ---
		// float maxTangentFloat = sqrt(sqrt( std::numeric_limits<float>::max() )) / 100; // 42949672.000000000 ~ 1e+7
		// float maxTangentFloat = 1e+7; float minDx = 1e-7 * 0.0001; // to allow dy to be max 10000 - fine for box
		// --- limit tangent for TENT ---
		float maxTangentFloat = 1e+3 *0.5;    float minDx = 1e-3 * 0.5;  // fine for tent
		// if denominator dx is near zero assign tangent to a max-value that still allows computation room:
		float tangent = (dx < minDx) ? ((dy>0) ? maxTangentFloat : -maxTangentFloat) : dy / dx;
		// alternative, early exit:
		//if (dx < minDx) return 0;
	#else
		float tangent = dy / dx;
	#endif

    // clip line against borders of prefilter footprint
    // Partition into 3 clipped line pieces between x1---x2---x3---x4
    float yaClip, ybClip, x1, x2, x3, x4;
    float Wa, Wb;
    // Start from zero: when no filter branch below matches (filter shape 'none', or a cubic
    // variant removed by its REDUCE_*_CODE_SIZE define) the result must not be undefined.
    float edgeWeight1 = 0.0;
    float edgeWeight2 = 0.0;
    float edgeWeight3 = 0.0;
    
    // ===== x1---x2: potential horizontal line-piece - in case when |ya| > hfw =====
    x1 = CLIP(xa, -hfw, hfw); // left side horizontal line piece
    // intersection x2 determines right side of a new horizontal line-piece
    float yaHfw = ya - (xa + hfw)*tangent; // y-value at x=-hfw
    float yaAbsMin = (xa<-hfw) ? yaHfw : ya;
    yaClip = CLIP(yaAbsMin, -hfw, hfw);
    x2 = xa - (ya - yaClip) / tangent; // div-by-0 when hor line; result is ok
    x2 = min(x2, xb);   // when rc neg. and xb<hfw & yb>hfw (point B above cell)
    x2 = CLIP(x2, -hfw, hfw);  // when xb>hfw or xb<-hfw (point B to right or left from cell)

    // ===== x2---x3: potential diagonal line-piece =====
    float ybHfw = ya - (xa - hfw)*tangent; // y-value at x=+hfw
    float ybAbsMin = (xb>hfw) ? ybHfw : yb;
    ybClip = CLIP(ybAbsMin, -hfw, hfw);
    x3 = xa - (ya - ybClip) / tangent;
    x3 = min(x3, xb);
    x3 = CLIP(x3, -hfw, hfw);
#if 0 // For debugging: TODO DISABLE - FIXED
		// when x3 and x2 are clipped to same value then following edgeWeight2 results in zero.
		// However below H()-H() computation is unstable if tangent is very large (near vertical edge)
		// and x3-x2 is very small (which is related to tangent being very large), avoid this by:
		if (abs(x3-x2)<0.000001) x3=x2; // this causes edgeWeight2 <-- 0
#endif
    float y0 = ya - xa*tangent;

    // ===== x3---x4 potential horizontal line-piece =====
    x4 = CLIP(xb, -hfw, hfw);

#if 1 // u_filterShape condition tree; choose 0 to choose TENT unconditionally; then speedup: ~ 5%

    if (u_filterShape == BOX) {
        edgeWeight1 = HAng0Diff_box(x1, x2, yaClip);
        //float edgeWeight2 = Hygx_box(x3, tangent, y0) - Hygx_box(x2, tangent, y0); // less optimized
        edgeWeight2 = HAngDiff_box(x2, x3, tangent, y0);
        edgeWeight3 = HAng0Diff_box(x3, x4, ybClip);
    }
    else if (u_filterShape == TENT) {
		//edgeWeight1 = Hygx_tent(x2, 0, yaClip) - Hygx_tent(x1, 0, yaClip);
		edgeWeight1 = Hyx_tent(x2, yaClip) - Hyx_tent(x1, yaClip);
		Wa = Hygx_tent(x2, tangent, y0);
		Wb = Hygx_tent(x3, tangent, y0);
		edgeWeight2 = Wb - Wa;
		//edgeWeight3 = Hygx_tent(x4, 0, ybClip) - Hygx_tent(x3, 0, ybClip);
		edgeWeight3 = Hyx_tent(x4, ybClip) - Hyx_tent(x3, ybClip);
    }
	#if !REDUCE_CUBIC_B1C0_CODE_SIZE
    else if (u_filterShape == CUBIC_B1C0) {
		edgeWeight1 = Hyx_cubicB1C0(x2, yaClip) - Hyx_cubicB1C0(x1, yaClip);
        Wa = Hygx_cubic_B1C0(x2, tangent, y0);
        Wb = Hygx_cubic_B1C0(x3, tangent, y0);
        edgeWeight2 = Wb - Wa;
		edgeWeight3 = Hyx_cubicB1C0(x4, ybClip) - Hyx_cubicB1C0(x3, ybClip);
    }
	#endif
	#if !REDUCE_CUBIC_B03C03_CODE_SIZE
    else if (u_filterShape == CUBIC_B03C03) {
		edgeWeight1 = Hyx_cubicB03C03(x2, yaClip) - Hyx_cubicB03C03(x1, yaClip);
        Wa = Hygx_cubic_B03C03(x2, tangent, y0);
        Wb = Hygx_cubic_B03C03(x3, tangent, y0);
        edgeWeight2 = Wb - Wa;
		edgeWeight3 = Hyx_cubicB03C03(x4, ybClip) - Hyx_cubicB03C03(x3, ybClip);
    }
	#endif
	#if !REDUCE_CUBIC_B0C1_CODE_SIZE
    else if (u_filterShape == CUBIC_B0C1) {
		edgeWeight1 = Hyx_cubicB0C1(x2, yaClip) - Hyx_cubicB0C1(x1, yaClip);
        Wa = Hygx_cubic_B0C1(x2, tangent, y0);
        Wb = Hygx_cubic_B0C1(x3, tangent, y0);
        edgeWeight2 = Wb - Wa;
		edgeWeight3 = Hyx_cubicB0C1(x4, ybClip) - Hyx_cubicB0C1(x3, ybClip);
    } 
	#endif
	#if !REDUCE_CUBIC_B05C05_CODE_SIZE
	else if (u_filterShape == CUBIC_B05C05) {
		edgeWeight1 = Hyx_cubicB05C05(x2, yaClip) - Hyx_cubicB05C05(x1, yaClip);
        Wa = Hygx_cubic_B05C05(x2, tangent, y0);
        Wb = Hygx_cubic_B05C05(x3, tangent, y0);
        edgeWeight2 = Wb - Wa;
		edgeWeight3 = Hyx_cubicB05C05(x4, ybClip) - Hyx_cubicB05C05(x3, ybClip);
    } 
	#endif
	#if !REDUCE_CUBIC_B0C05_CODE_SIZE
	else if (u_filterShape == CUBIC_B0C05) {
		edgeWeight1 = Hyx_cubicB0C05(x2, yaClip) - Hyx_cubicB0C05(x1, yaClip);
        Wa = Hygx_cubic_B0C05(x2, tangent, y0);
        Wb = Hygx_cubic_B0C05(x3, tangent, y0);
        edgeWeight2 = Wb - Wa;
		edgeWeight3 = Hyx_cubicB0C05(x4, ybClip) - Hyx_cubicB0C05(x3, ybClip);
    } 
	#endif
	#if !REDUCE_CUBIC_GENERIC_CODE_SIZE
    else if (u_filterShape == CUBIC_BC) { // condition needed in case above code is disabled
		edgeWeight1 = Hyx_cubicBC(x2, yaClip) - Hyx_cubicBC(x1, yaClip);
        Wa = Hygx_cubicBC(x2, tangent, y0);
        Wb = Hygx_cubicBC(x3, tangent, y0);
        edgeWeight2 = Wb - Wa;
		edgeWeight3 = Hyx_cubicBC(x4, ybClip) - Hyx_cubicBC(x3, ybClip);
    }
	#endif
    
#else // unconditionally (of u_filterShape) select TENT
    edgeWeight1 = Hygx_tent(x2, 0, yaClip) - Hygx_tent(x1, 0, yaClip);
    Wa = Hygx_tent(x2, tangent, y0);
    Wb = Hygx_tent(x3, tangent, y0);
    edgeWeight2 = Wb - Wa;            
    edgeWeight3 = Hygx_tent(x4, 0, ybClip) - Hygx_tent(x3, 0, ybClip);
#endif
    
    float edgeWeight = signAB * (edgeWeight1 + edgeWeight2 + edgeWeight3);
    return edgeWeight;
}

// Read bi-linear interpolated color from texture bitmap.
// Similar to GLSL's "texture" lookup, except the domain is different (here grid-coordinates; in GLSL unit square).
//   uvGrid : sample position in texel-grid coordinates; integer values address texels directly
//            (they are used as texelFetch coordinates).
//   texSize: texture dimensions in texels.
// Returns the RGB color interpolated between the four surrounding texels of mip level 0.
// Positions outside [0, texSize-1] are clamped to the nearest border texel, so no texel outside
// the texture is fetched and no blending towards an interior texel happens beyond the border.
vec3 textureBilinear(vec2 uvGrid, ivec2 texSize)
{
    // Clamp the sample position itself (not only the derived indices): the interpolation phase
    // must be taken from the clamped position, otherwise a position below 0 would still blend
    // texel 0 with texel 1, and a position beyond the last texel would index past the texture.
    ivec2 lastTexel = max(texSize - ivec2(1), ivec2(0));
    vec2 uvClamped = clamp(uvGrid, vec2(0.0), vec2(lastTexel));

    ivec2 grid0 = ivec2(floor(uvClamped));           // lower-left texel
    ivec2 grid1 = min(grid0 + ivec2(1), lastTexel);  // upper-right texel, kept inside the texture
    
    ivec2 uvTexGrid00 = ivec2(grid0.s, grid0.t);
    ivec2 uvTexGrid10 = ivec2(grid1.s, grid0.t);
    ivec2 uvTexGrid11 = ivec2(grid1.s, grid1.t);
    ivec2 uvTexGrid01 = ivec2(grid0.s, grid1.t);
    
    vec3 c0 = texelFetch(s_texUnit, uvTexGrid00, 0).rgb;
    vec3 c1 = texelFetch(s_texUnit, uvTexGrid10, 0).rgb;
    vec3 c2 = texelFetch(s_texUnit, uvTexGrid11, 0).rgb;
    vec3 c3 = texelFetch(s_texUnit, uvTexGrid01, 0).rgb;

    // Interpolate along u on the lower and upper texel rows, then along v between the rows.
    vec2 phase = fract(uvClamped);
    vec3 c01Lerp = mix(c0, c1, phase.s);
    vec3 c32Lerp = mix(c3, c2, phase.s);
    vec3 cBilin = mix(c01Lerp, c32Lerp, phase.t);
    return cBilin;
}


// Fragment shader entry point. The compile-time #if/#elif chain selects one implementation;
// the active one ("#elif 1") is the ATM algorithm:
//   1. Estimate the fragment's footprint in normalized texture space with fwidth(v_texCoord) and
//      widen it for tent/cubic filters; convert it to a texel index range texStart..texEnd.
//   2. For every texel in that range, map the four texel corners (texel position +-0.5) to the
//      viewport with mTexGridToVpHom_frag and sum WtEdgeAB over the four mapped edges, taken
//      relative to the pixel position, to obtain the texel weight wTexel.
//   3. Accumulate wTexel * color (rgb) and wTexel (a) in colorAccu.
//   4. Optionally add a magnification marker color, then write colorAccu.rgb with alpha 1.
// The other branches are disabled experiments.
void main()
{
#if 0 // normal texture filtering (controlled by OpenGL glTexParameteri settings)
    vec4 redish = vec4(1.0, 0.9, 0.9, 1.0);
    outputAtmInvTexture = texture(s_texUnit, v_texCoord) * redish;
    
#elif 0 // experiment (no ATM); roughly averaging texels within the "pixels preimage" footprint.
    #define SAMPLES 5 // must be odd
    #define START_SAMPLE -SAMPLES/2
    vec4 colorAccu = vec4(0);
    vec2 preImageWidthApprox = fwidth(v_texCoord);//*5 // conservative width approximation (in normalized texture space)
    vec2 stepTex = preImageWidthApprox / SAMPLES;

    vec2 posTex = START_SAMPLE * stepTex;
    for (int i = 0; i < SAMPLES; i++, posTex.t += stepTex.t)
    {
        posTex.s = START_SAMPLE * stepTex.s;
        for (int j = 0; j < SAMPLES; j++, posTex.s += stepTex.s)

        colorAccu += vec4( texture(s_texUnit, v_texCoord + posTex).rgb, 1 );
    }
    outputAtmInvTexture = colorAccu / (colorAccu.a);    
       
       
#elif 1 // ATM algorithm

	// fwidth -- https://developer.download.nvidia.com/cg/fwidth.html
	// vec2 fwidth(vec2 v) return abs(ddx(v)) + abs(ddy(v));
	// Sum of abs of each approximate partial derivative of v with respect to window-space x and y coordinate.
    vec2 preImageWidthApprox = fwidth(v_texCoord); // Conservative fragment 'width/height' approximation,
    
    // if (u_filterShape == BOX):
    vec2 texHfw = preImageWidthApprox; // Half prefilter width (and height) in normalized texture space.
    if (u_filterShape == TENT) texHfw *= 2.0;
    if (u_filterShape >= CUBIC_B1C0) texHfw *= 4.0; // ">=CUBIC_B1C0" are all cubic filters
  
    // Texel index range (inclusive) covering the footprint around v_texCoord.
    // NOTE(review): texStart/texEnd are not clamped to [0, texSize-1], so the texelFetch in the
    // box-reconstruction path below can be issued with out-of-range coordinates near the texture
    // border - confirm that this is intended / covered by the GL context's robustness settings.
    ivec2 texSize = textureSize(s_texUnit, 0);
    mat2 scaleNormalizedToTexGrid = getNormalizedToTexGrid(texSize);
    ivec2 texStart = ivec2( floor( scaleNormalizedToTexGrid * (v_texCoord - texHfw) ) );
    ivec2 texEnd   = ivec2( ceil ( scaleNormalizedToTexGrid * (v_texCoord + texHfw) ) );
    
    #if ENABLE_u_reconstructionFilter
        mat2 scaleTexGridToNormalized = getTexGridToNormalized(texSize);
    #endif

    // rgb: weighted color sum; a: sum of the texel weights
    vec4 colorAccu = vec4(0);
    vec2 vVpGrid = vec2( gl_FragCoord.xy - 0.5 ).xy; // perhaps use layout qualifier 'pixel_center_integer' instead?
    
    // V LOOP  ===================================================================
    for (int vGrid = texStart.t; vGrid <= texEnd.t; vGrid++) // later set "<=" back to "<"
    {

        // U LOOP ==========================================================================  
        for (int uGrid = texStart.s; uGrid <= texEnd.s; uGrid++) // later set "<=" back to "<"
        {
            ivec2 texGrid = ivec2(uGrid, vGrid);
            
            // map texel midpoints (i.e. the texel box-reconstructed corners) to viewport to obtain 4 "texel-edges"
            const vec2 texelCornerOffsets[4] = { vec2(-0.5,-0.5), vec2(+0.5,-0.5), vec2(+0.5,+0.5), vec2(-0.5,+0.5) };
            vec2 midPoints[4]; // texel midpoints mapped in viewport space
            // texel-box-rec-corners:   midpoints in viewport:
            //    3 _______ 2                mp3________mp2
            //     |       |     maps to:      /       /
            //     |   X   |                  /   X   /
            //    0|_______|1             mp0/_______/mp1
            for (int i=0; i<4; i++) 
            {
                // derive midpoints in viewport space (having origin at lower-left)
                vec2 texGridTexCorner = vec2(texGrid) + texelCornerOffsets[i];
                vec4 vVpHomTexCorners = mTexGridToVpHom_frag * vec4(texGridTexCorner, 0, 1);
                midPoints[i] = ( vVpHomTexCorners / vVpHomTexCorners.w ).xy; // perspective divide
            }
            float uScale = distance(midPoints[0], midPoints[1]); // mapped unit-U distance
            float vScale = distance(midPoints[1], midPoints[2]); // mapped unit-V distance

            #if ENABLE_u_reconstructionFilter // enable switching between box and bi-linear reconstruction
            if (u_reconstructionFilter == 0)
            #endif
            #pragma region box reconstruction filter
            {
                vec3 texColor = texelFetch(s_texUnit, texGrid, 0).rgb; // set 'reconstructed' color with texel color
                
                // dbg: Instead, use constant color (200) to verify correct unity of weights summation:
                #if ENABLE_u_showAccumulatedWeights
                    // To enable press "s" > "w" demo's UI:
                    texColor = mix(texColor, vec3(200.0/255.0), vec3(u_showAccumulatedWeights));
                #endif
            
                // --- weigh the contribution of the texel  ---
                float wTexel = 0;
                // for each of the 4 mapped texel edges do (redundantly recompute 2 of 4 WtEdgeAB):
                for (int i = 0; i < 4; i++) wTexel += WtEdgeAB( midPoints[i] - vVpGrid, midPoints[(i+1)%4] - vVpGrid );
                vec3 wColor = wTexel * texColor;
                
                // For clarity: when bilinear reconstruction is selected but disabled, mark magnified pixels black:
                // (the color is kept when u_reconstructionFilter == 0, or when both uScale and vScale are < 1;
                //  the weight wTexel is still added to colorAccu.a below)
                #if !ENABLE_u_reconstructionFilter
                    wColor *= int(uScale < 1.0 && vScale < 1.0 || !bool(u_reconstructionFilter));
                #endif

                colorAccu += vec4( wColor, wTexel );
            }
            #pragma endregion
            #pragma region bilinear reconstruction filter
            #if ENABLE_u_reconstructionFilter
            else // using bilinear interpolation as reconstruction filter when magnification
            {
                float lerpBias = 0; //LERP_BIAS; // to reduce (at the cost of mag-quality) artificial inserted samples
                int uSubSamplesCount = int(max(floor(uScale - lerpBias), 0.0f)) + 1;
                int vSubSamplesCount = int(max(floor(vScale - lerpBias), 0.0f)) + 1;

                vec2 vSubPoints[4]; // quad with same u-length ('width') as midPoints quad; vertically subdivided
                vec2 uSubPoints[4]; // quad with same v-length (height') as vSubPoints quad; horizontally subdivided

                // ===== Loop over artificial inserted sub-samples =====
                // Note: when no magnification (and uSubSamplesCount == 1 and vSubSamplesCount == 1)
                // then one texel 'sub'-sample generated with same footprint (subpoints) as the texel sample (midpoints).
                vSubPoints[0] = vec2(midPoints[0]);
                vSubPoints[1] = vec2(midPoints[1]);

                // TODO: limit vIdx, uIdx loops by determining uSubStart/uSubEnd on basis of bbPreImage and texel-box footprint
                float vStep = 1.0f / vSubSamplesCount;
                for (int vIdx = 0; vIdx < vSubSamplesCount; vIdx++)
                {
                    // vPositionRatio determines 'top' of vIdx'th sub-sample (will be 'bottom' of vIdx+1 sub-sample)
                    float vPositionRatio = vStep * float(vIdx+1); // range [vStep; 1.0]
                    // v-axis (vertical) linear interpolation of subMidPoints
                    vSubPoints[3] = mix(midPoints[0], midPoints[3], vPositionRatio);
                    vSubPoints[2] = mix(midPoints[1], midPoints[2], vPositionRatio);

                    // Determine float coordinate for bilinear interpolation of (magnified) color (center of sub-samples).
                    float vSubSampleGrid = float(vGrid) - 0.5f + vStep/2.0f + vStep * float(vIdx);

                    uSubPoints[0] = vSubPoints[0];
                    uSubPoints[3] = vSubPoints[3];
                    float uStep = 1.0f / uSubSamplesCount;
                    for (int uIdx = 0; uIdx < uSubSamplesCount; uIdx++)
                    {
                        float uRatio = uStep * float(uIdx+1);
                        uSubPoints[1] = mix(vSubPoints[0], vSubPoints[1], uRatio);
                        uSubPoints[2] = mix(vSubPoints[3], vSubPoints[2], uRatio);

                        float uSubSampleGrid = float(uGrid) - 0.5f + uStep/2.0f + uStep * float(uIdx);
                        vec2 uvSubSampleGrid = vec2(uSubSampleGrid, vSubSampleGrid);
                        #if 1
                            vec3 recColor = textureBilinear(uvSubSampleGrid, texSize);
                            //vec3 recColor = texelFetch(s_texUnit, texGrid, 0).rgb; //dbg
                        #else
                            // Next line obtains almost same as above textureBilinear; 
                            // (with GL_TEXTURE_MIN_FILTER <-- GL_NEAREST and GL_TEXTURE_MAG_FILTER <-- GL_LINEAR)
                            // But it does blur also minified part a bit and gives some grainy/noisy artifacts overlay-ed ...
                            // So better not use it and stick to texelFetch()-based textureBilinear()
                            vec3 recColor = texture(s_texUnit, scaleTexGridToNormalized * (vec2(texGrid)+0.5) ).rgb;
                        #endif
                        
                        // dbg: Instead, use constant color (200) to verify correct unity of weights summation:
                        #if ENABLE_u_showAccumulatedWeights
                            // To enable press "s" > "w" demo's UI:
                            recColor = mix(recColor, vec3(200.0/255.0), vec3(u_showAccumulatedWeights));
                        #endif
                            
                        // --- weight the (sub)texel's contribution ---
                        // for each of the 4 mapped (sub)texel edges do
                        float wTexel = 0;
                        for (int i = 0; i < 4; i++)
                        {
                            float wEdge = WtEdgeAB( uSubPoints[i] - vVpGrid, uSubPoints[(i+1)%4] - vVpGrid );
                            wTexel += wEdge;
                        }

                        colorAccu += vec4( wTexel * recColor, wTexel );
                        
                        uSubPoints[0] = uSubPoints[1]; // 'right' points become 'left' points for next 'horizontal' subsample quad
                        uSubPoints[3] = uSubPoints[2];
                    }
                    vSubPoints[0] = vSubPoints[3]; // 'top' points become 'bottom' points for next 'vertical' row of subsample quads
                    vSubPoints[1] = vSubPoints[2];    
                }

            }
            #endif //ENABLE_u_reconstructionFilter
            #pragma endregion
        }
    }
    // mark [u|v]-magnified areas by highlight pixel for 1) magnified area or 2) integer [u|v]scale transitions
    if (u_markOnMagnification > 0) 
    {
        #if 0 // Computing [u|v]scale based on texture GRID (texStart) shows heavy aliasing at strong ('integer-pixel') magnification.
            vec4 vVpHomTexCenter = mTexGridToVpHom_frag * vec4(texStart, 0, 1);
            vec4 vVpHomTexUnitU  = mTexGridToVpHom_frag * vec4(texStart + vec2(1.0, 0.0), 0, 1);
            vec4 vVpHomTexUnitV  = mTexGridToVpHom_frag * vec4(texStart + vec2(0.0, 1.0), 0, 1);
        #else // Instead use floating point interpolant v_texCoord.
            vec2 pixToTexGrid = vec2( getNormalizedToTexGrid(texSize) * v_texCoord );
            vec4 vVpHomTexCenter = mTexGridToVpHom_frag * vec4(pixToTexGrid, 0, 1);
            vec4 vVpHomTexUnitU  = mTexGridToVpHom_frag * vec4(pixToTexGrid + vec2(1.0, 0.0), 0, 1);
            vec4 vVpHomTexUnitV  = mTexGridToVpHom_frag * vec4(pixToTexGrid + vec2(0.0, 1.0), 0, 1);
        #endif
            // perspective divide, then measure the viewport length of one texel step in u and in v
            vec2 vVpTexCenter = ( vVpHomTexCenter / vVpHomTexCenter.w ).xy;
            vec2 vVpTexUnitU = ( vVpHomTexUnitU / vVpHomTexUnitU.w ).xy;
            vec2 vVpTexUnitV = ( vVpHomTexUnitV / vVpHomTexUnitV.w ).xy;
            float uScale = distance(vVpTexCenter, vVpTexUnitU); // mapped unit-U distance
            float vScale = distance(vVpTexCenter, vVpTexUnitV); // mapped unit-V distance
        
        if (u_markOnMagnification == 1) 
        {
            // Colorwash magnified areas
            colorAccu.r += int(uScale > 1.0) * 0.2; // colorwash red hor magnification
            colorAccu.g += int(vScale > 1.0) * 0.2; // colorwash green vert magnification
        } else { // u_markOnMagnification == 2
            // Color transition band of minification-magnification integer [u|v]-scale values:
            colorAccu.r += pow( 1 - 2*abs(uScale - floor(uScale+0.5)), 60/uScale) * 0.5; // function that obtains an
            colorAccu.g += pow( 1 - 2*abs(vScale - floor(vScale+0.5)), 60/vScale) * 0.5; // anti-aliased band        
            
            // experiment: show magnification due dFdx and dFdy (with 'F' the texture-grid coordinates u or v):
            // result: this gives similar border as the above code but not same (but probably faster).
            // colorAccu.r += int(length(dFdx(getNormalizedToTexGrid(texSize) * v_texCoord)) <= 1.0) * 0.4;
            // colorAccu.g += int(length(dFdy(getNormalizedToTexGrid(texSize) * v_texCoord)) <= 1.0) * 0.4;
        }
    }
    
    // The accumulated weight colorAccu.a is not used for normalization here; alpha is forced to 1.
    outputAtmInvTexture = vec4( colorAccu.rgb, 1);  // LATER, when sum of all wTexel == 1.0

    
#else // some experiment
    // approximation (assuming local linearization) of texel forward mapping of texelCorner
    // Since the texture to viewport is a perspective mapping, this is an approximation.
    // The worry is: that forward mapped "texel-footprints" will not be perfectly adjacent in viewport space.
    mat2 invJacobian = mat2(dFdx(v_texCoord), dFdy(v_texCoord));
    mat2 fwdJacobian = inverse(invJacobian);
    texelCornerInViewport = gl_FragCoord  +  fwdJacobian * (texelCorner - v_texCoord);

#endif
}