// Fixed-point distance term: the difference between snsq_fx[j] and a scaled
// codebook entry is formed, squared and scaled down so that accumulating
// several such terms does not overflow.
// NOTE(review): Q-format annotations below are taken from the original
// comments; the declarations of dist/tmp/tmp2/cdbk_ptr/cdbk_fix/snsq_fx are
// outside this excerpt - confirm their types and formats against the caller.

// Scale the current value of dist down by 4 bits as overflow headroom for a
// following multiplication.
// NOTE(review): within this excerpt dist is reassigned by the L_sub below
// before this shifted value is read - confirm whether this statement belongs
// to a different code path.
dist=L_shr(dist,4);// make sure that the next multiplication does not overflow
// Read the current codebook entry, advance cdbk_ptr to the next entry, and
// multiply the entry by cdbk_fix to bring it to the format of snsq_fx.
tmp=L_mult(*cdbk_ptr++,cdbk_fix);// Q12->Q16
// Difference between the j-th input value and the scaled codebook entry.
dist=L_sub(snsq_fx[j],tmp);// Q16
// Normalise the difference upwards before squaring it.
// cdbk_ptr is a 16-bit LUT holding 3.12 values which are used here as 3.16.
// Assumption: snsq_fx has the same representation. The subtraction result is
// then in 4.16 format, which leaves 11 bits of headroom in a 32-bit word.
dist=L_shl(dist,11);// consume the 11 bits of headroom (see note above)
// Square the difference.
// NOTE(review): presumably Mpy_32_32 is a 32x32-bit fractional multiply that
// returns a 32-bit result - its definition is not visible here.
dist=Mpy_32_32(dist,dist);
// 3 bits of headroom = room for accumulating 2^3 = 8 such terms.
dist=L_shr(dist,3);// make sure that the sum of 8 values does not overflow
// NOTE(review): if extract_l is the ITU-T basic operator it keeps only the
// low 16 bits of dist; applied to the value produced above this would drop the
// upper bits and square an already squared value - this and the following two
// statements look like an alternative 16-bit variant of the squaring; confirm.
tmp2=extract_l(dist);
// Square the 16-bit value into a 32-bit result.
dist=L_mult(tmp2,tmp2);
// Scale the squared value down by 4 bits as headroom for accumulation.
dist=L_shr(dist,4);// make sure that the sum does not overflow