Additive Alpha Blending Routine in Assembly

// ---------------------------------------------------------------------------
// Additive (saturating) blend of a 16-bit RGB565 source surface onto a 16-bit
// RGB565 destination surface. This is the body of a VC++ inline-assembler
// (__asm) block, mostly designed for a locked DirectDraw surface.
//
// Inputs (C variables that must be in scope):
//   xwidth, xheight       - size of the area to blend, in pixels
//   ddsdSource.lpSurface  - pointer to the first source pixel
//   ddsdDest.lpSurface    - pointer to the first destination pixel
//   ipitchsrc             - the pitch DirectDraw gave you in Lock() minus
//                           twice the source width (bytes to skip per row)
//   ipitchdest            - the same thing for the destination surface
//
// Registers used: eax = source pixel / blended result
//                 edx = destination pixel, then snapshot of the raw sum
//                 ecx = row counter (outer) / column counter (inner)
//                 esi = source pointer, edi = destination pointer
//
// Pixel layout (565): red = bits 11-15, green = bits 5-10, blue = bits 0-4.
//   0xF7DE  clears the lowest bit of every channel (bits 0, 5 and 11) so the
//           carry out of each channel has an empty bit to land in.
//   0x00020 carry out of blue   -> saturate with 0x001F
//   0x00800 carry out of green  -> saturate with 0x07E0
//   0x10000 carry out of red    -> saturate with 0xF800
//
// A source pixel of pure black (0) is treated as transparent and skipped.
// Use "cmp eax, value" instead of "test eax, eax" if black isn't transparent
// for you, or take both lines out if you don't need transparency.
// ---------------------------------------------------------------------------

        mov     ecx, xwidth
        test    ecx, ecx
        jz      done16bit565                // Zero width: dec/jnz would wrap around.
        mov     ecx, xheight
        test    ecx, ecx
        jz      done16bit565                // Zero height: same problem, bail out.

        mov     esi, ddsdSource.lpSurface
        mov     edi, ddsdDest.lpSurface

heightLoop16bit565:
        push    ecx                         // Park the row counter; ecx is reused for columns.
        mov     ecx, xwidth

widthLoop16bit565:
        movzx   eax, word ptr [esi]         // Source pixel, zero-extended to 32 bits.
        test    eax, eax
        jz      exLoop16bit565              // Pure black source = transparent, leave dest alone.

        movzx   edx, word ptr [edi]         // Destination pixel.
        and     eax, 0xF7DE                 // Drop each channel's lowest bit in both pixels so
        and     edx, 0xF7DE                 // the three channel sums cannot run into each other.
        add     eax, edx                    // All three channels are summed in one 32-bit add.

        // Decide saturation from an untouched copy of the sum. Testing eax
        // after OR-ing a saturation mask into it would see bits set by that
        // mask (0xF800 sets bit 11, 0x07E0 sets bit 5) and wrongly saturate
        // the lower channels as well.
        mov     edx, eax
        and     eax, 0xF7DE                 // Remove the carry bits (5, 11, 16) from the result.

        test    edx, 0x00020                // Did blue overflow?
        jz      noBlueLoop16bitAAB565
        or      eax, 0x001F                 // Clamp blue to maximum.
noBlueLoop16bitAAB565:
        test    edx, 0x00800                // Did green overflow?
        jz      noGreenLoop16bitAAB565
        or      eax, 0x07E0                 // Clamp green to maximum.
noGreenLoop16bitAAB565:
        test    edx, 0x10000                // Did red overflow?
        jz      noRedLoop16bitAAB565
        or      eax, 0xF800                 // Clamp red to maximum.
noRedLoop16bitAAB565:
        mov     [edi], ax                   // Store the low 16 bits as the blended pixel.

exLoop16bit565:
        add     esi, 2                      // Next source pixel (2 bytes per pixel).
        add     edi, 2                      // Next destination pixel.
        dec     ecx
        jnz     widthLoop16bit565           // More pixels left on this row.

        pop     ecx                         // Row counter back into ecx.
        add     esi, ipitchsrc              // Skip the padding to the start of the next row.
        add     edi, ipitchdest
        dec     ecx
        jnz     heightLoop16bit565          // More rows left.

done16bit565: