;***************************************************************************
; unit:    raster      release 0.38                                        *
; purpose: general manipulation of n-dimensional matrices, n = 1, 2 and 3. *
;          Use this file or rasterc.c. You cannot link both files together *
; license:     GPL or LGPL                                                 *
; Copyright: (c) 1998-2025 Jaroslav Fojtik                                 *
;***************************************************************************

.586              ;Target processor.  Use instructions for Pentium class machines
.XMM
.MODEL FLAT, C    ;Use the flat memory model. Use C calling conventions

.CODE             ;Indicates the start of a code segment.


; https://www.plantation-productions.com/Webster/www.artofasm.com/Linux/HTML/TheMMXInstructionSeta2.html
; https://docs.oracle.com/cd/E19253-01/817-5477/eojdc/index.html


;void Conv4_8_SSE(BYTE *Dest, const BYTE *Src, unsigned Size1D)
; Expands packed 4-bit pixels to 8-bit pixels by duplicating every nibble
; (N -> NN). For each source byte the high nibble is written first.
;   Dest  - output buffer, one byte per pixel
;   Src   - input buffer, two pixels per byte
;   count - number of pixels to convert
; Scratch registers: eax, ecx, edx, xmm0-xmm4.
        public  Conv4_8_SSE
Conv4_8_SSE proc \
        uses edi esi, \
        Dest:ptr byte, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]     ; ecx = number of pixels
        mov     edi,[Dest]     ; edi = destination pointer
        mov     esi,[Src]      ; esi = source pointer

        sub	ecx,16
        jl	PIXEL1		; result negative: no complete group of 16 pixels, use the scalar loop only

	mov	eax,0F0F0F0F0h
	movd	xmm4,eax
	punpcklbw xmm4,xmm4	; byte doubling: low qword of xmm4 = 8 x 0F0h (high nibble mask)
	mov	eax, 0F0F0F0Fh
	movd	xmm3,eax
	punpcklbw xmm3,xmm3	; low qword of xmm3 = 8 x 00Fh (low nibble mask)
PIXEL16:movq	xmm0,qword ptr[esi]	; FEDCBA9876543210  (8 bytes = 16 pixels, shown most significant nibble first)
	movq	xmm1,xmm0
	add	esi,8
	pand	xmm0,xmm3		; -E-C-A-8-6-4-2-0  low nibbles only
	movq	xmm2,xmm0
	psllw	xmm2,4			; E-C-A-8-6-4-2-0-    there is no byte shift; a word shift is safe because the high nibbles are zero
	por	xmm0,xmm2		; EECCAA8866442200  low nibbles duplicated

	pand	xmm1,xmm4		; F-D-B-9-7-5-3-1-  high nibbles only
	movq	xmm2,xmm1
	psrlw	xmm2,4			; -F-D-B-9-7-5-3-1
	por	xmm1,xmm2		; FFDDBB9977553311  high nibbles duplicated
	
	punpcklbw xmm1,xmm0		; interleave bytes; in memory order: hi-nibble pixel, lo-nibble pixel, ... (16 bytes)
	movq	qword ptr [edi],xmm1		; store low 8 output bytes
	movhps	qword ptr [edi+8],xmm1		; store high 8 output bytes
	add	edi,16
	sub	ecx,16
	jae	PIXEL16			; no borrow: at least 16 more pixels remain
;	emms

PIXEL1:	add	ecx,16			; undo the last subtraction: ecx = pixels still to do
	jz	ToEnd			; array has zero size or all done        
        cld				; string instructions below must move forward
PIXEL:	lodsb				; al = HL (two packed pixels)
	mov	ah,al			; ax = HLHL
	mov	dx,ax		; dx = HLHL
	rol	ax,4		; ax = LHLH
	and	dx,00FF0h	; dx = 0LH0
	and	ax,0F00Fh	; ax = L00H
	or	ax,dx		; ax = LLHH -> al = HH (first pixel), ah = LL (second pixel)
	sub	ecx,2
	jb	ToEndStor1	; borrow: only one pixel was left
	stosw			; store both pixels; stosw leaves the flags of the sub intact
	jnz	PIXEL		; ZF from the sub above: loop while pixels remain

ToEnd:
        ret                     ; _cdecl return
        
ToEndStor1:			; one remaining byte needs to be stored
	stosb			; al = pixel taken from the high nibble
	ret        
                
Conv4_8_SSE endp


;*************************************************************************************


;void Conv4_16_SSE(WORD *Dest, const BYTE *Src, unsigned Size1D)
; Expands packed 4-bit pixels to 16-bit pixels by repeating every nibble
; four times (N -> NNNN). For each source byte the high nibble is written first.
;   Dest  - output buffer, one word per pixel
;   Src   - input buffer, two pixels per byte
;   count - number of pixels to convert
; Scratch registers: eax, ecx, edx, xmm0-xmm4.
        public  Conv4_16_SSE
Conv4_16_SSE proc \
        uses edi esi, \
        Dest:ptr word, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]     ; ecx = number of pixels
        mov     edi,[Dest]     ; edi = destination pointer
        mov     esi,[Src]      ; esi = source pointer

        sub	ecx,8
        jl	PIXEL1		; result negative: no complete group of 8 pixels, use the scalar loop only

	mov	eax,0F0F0F0F0h
	movd	xmm4,eax		; xmm4 low dword = high nibble mask
	mov	eax, 0F0F0F0Fh
	movd	xmm3,eax		; xmm3 low dword = low nibble mask
PIXEL8:	movd	xmm0,dword ptr[esi]	; 87654321  (4 bytes = 8 pixels, shown most significant nibble first)
	movq	xmm1,xmm0
	add	esi,4
	pand	xmm0,xmm3			; -7-5-3-1  low nibbles only
	movq	xmm2,xmm0
	psllw	xmm2,4			; 7-5-3-1-	there is no byte shift; a word shift is safe because the high nibbles are zero
	por	xmm0,xmm2			; 77553311

	pand	xmm1,xmm4			; 8-6-4-2-  high nibbles only
	movq	xmm2,xmm1
	psrlw	xmm2,4			; -8-6-4-2
	por	xmm1,xmm2			; 88664422
	
	punpcklbw xmm1,xmm0		; interleave to 8 bytes; memory order: hi-nibble pixel, lo-nibble pixel, ...
	punpcklbw xmm1,xmm1		; double every byte: 8 words, one per pixel
	movlps	qword ptr [edi],xmm1		; store pixels 0..3
	movhps	qword ptr [edi+8],xmm1		; store pixels 4..7
	add	edi,16
	sub	ecx,8
	jae	PIXEL8			; no borrow: at least 8 more pixels remain
;	emms	

PIXEL1:	add	ecx,8			; undo the last subtraction: ecx = pixels still to do
	jz	toend			; array has zero size or all done        
        cld        			; string instructions below must move forward
PIXEL:	lodsb				; al = HL (two packed pixels)
	mov	ah,al
	mov	dx,ax
	sal	eax,16
	mov	ax,dx
	mov	edx,eax		; eax = edx = HLHLHLHL
	
	rol	eax,4		; eax = LHLHLHLH
	and	edx,00F0FF0F0h	; edx = 0L0LH0H0
	and	eax,0F0F00F0Fh	; eax = L0L00H0H
	or	eax,edx		; eax = LLLLHHHH -> low word = first pixel, high word = second pixel
	sub	ecx,2
	jb	ToEndStor1	; only 1 pixel is remaining
	stosd			; store both pixels; stosd leaves the flags of the sub intact
	jnz	PIXEL		; ZF from the sub above: loop while pixels remain

ToEnd:
        ret                     ; _cdecl return
        
ToEndStor1:			; one remaining pixel needs to be stored
	stosw			; ax = HHHH
        ret                     ; _cdecl return        
                
Conv4_16_SSE endp


;*************************************************************************************


;void Conv4_32_SSE(DWORD *Dest, const BYTE *Src, unsigned Size1D)
; Expands packed 4-bit pixels to 32-bit pixels by repeating every nibble
; eight times (N -> NNNNNNNN). For each source byte the high nibble is written first.
;   Dest  - output buffer, one dword per pixel
;   Src   - input buffer, two pixels per byte
;   count - number of pixels to convert
; Scratch registers: eax, ecx, edx, xmm0-xmm4.
        public  Conv4_32_SSE
Conv4_32_SSE proc \
        uses edi esi, \
        Dest:ptr dword, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]     ; ecx = number of pixels
        mov     edi,[Dest]     ; edi = destination pointer
        mov     esi,[Src]      ; esi = source pointer

	sub	ecx,8
        jl	PIXEL1		; result negative: no complete group of 8 pixels, use the scalar loop only

	mov	eax,0F0F0F0F0h
	movd	xmm4,eax		; xmm4 low dword = high nibble mask
	mov	eax, 0F0F0F0Fh
	movd	xmm3,eax		; xmm3 low dword = low nibble mask
PIXEL8:	movd	xmm0,dword ptr[esi]	; 87654321  (4 bytes = 8 pixels, shown most significant nibble first)
	movq	xmm1,xmm0
	add	esi,4
	pand	xmm0,xmm3			; -7-5-3-1  low nibbles only
	movq	xmm2,xmm0
	psllw	xmm2,4			; 7-5-3-1-	there is no byte shift; a word shift is safe because the high nibbles are zero
	por	xmm0,xmm2			; 77553311

	pand	xmm1,xmm4			; 8-6-4-2-  high nibbles only
	movq	xmm2,xmm1
	psrlw	xmm2,4			; -8-6-4-2
	por	xmm1,xmm2			; 88664422
	
	punpcklbw xmm1,xmm0		; interleave to 8 bytes; memory order: hi-nibble pixel, lo-nibble pixel, ...
	punpcklbw xmm1,xmm1		; double every byte: 8 words, one per pixel
	movaps	xmm0,xmm1
	punpcklwd xmm0,xmm0		; double the low 4 words: pixels 0..3 as dwords
	movlps	qword ptr [edi],xmm0
	movhps	qword ptr [edi+8],xmm0
	punpckhwd xmm1,xmm1		; double the high 4 words: pixels 4..7 as dwords
	movlps	qword ptr [edi+16],xmm1
	movhps	qword ptr [edi+24],xmm1

	add	edi,32
	sub	ecx,8
	jae	PIXEL8			; no borrow: at least 8 more pixels remain
;	emms	

PIXEL1:	add	ecx,8			; undo the last subtraction: ecx = pixels still to do
	jz	toend			; array has zero size or all done        
        cld        			; string instructions below must move forward
PIXEL:	lodsb				; al = HL (two packed pixels)
	mov	ah,al
	mov	dx,ax
	sal	eax,16
	mov	ax,dx
	mov	edx,eax		; eax = edx = HLHLHLHL
	
	rol	eax,4		; eax = LHLHLHLH
	and	edx,00F0FF0F0h	; edx = 0L0LH0H0
	and	eax,0F0F00F0Fh	; eax = L0L00H0H
	or	eax,edx		; eax = LLLLHHHH
	mov	edx,eax		; edx = LLLLHHHH
	rol	eax,16		; eax = HHHHLLLL
	xchg	ax,dx		; swap low words: eax = HHHHHHHH, edx = LLLLLLLL
	stosd			; store the first pixel (at least one pixel is always pending here)
	
	mov	eax,edx		; 2nd pixel		
	sub	ecx,2
	jb	ToEnd		; borrow: the first pixel was the last one
	stosd			; preserves ZF
	jnz	PIXEL		; ZF from the sub above: loop while pixels remain

ToEnd:
        ret                     ; _cdecl return
                
Conv4_32_SSE endp


;*************************************************************************************


;void Conv4_64_SSE(uint64_t *Dest, const uint8_t *Src, unsigned Size1D)
; Expands packed 4-bit pixels to 64-bit pixels by repeating every nibble
; sixteen times. For each source byte the high nibble is written first.
;   Dest  - output buffer, one qword per pixel (declared "ptr dword" below)
;   Src   - input buffer, two pixels per byte
;   count - number of pixels to convert
; Scratch registers: eax, ecx, edx, xmm0-xmm4.
        public  Conv4_64_SSE
Conv4_64_SSE proc \
        uses edi esi, \
        Dest:ptr dword, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]     ; ecx = number of pixels
        mov     edi,[Dest]     ; edi = destination pointer
        mov     esi,[Src]      ; esi = source pointer

	sub	ecx,8
        jl	PIXEL1		; result negative: no complete group of 8 pixels, use the scalar loop only

	mov	eax,0F0F0F0F0h
	movd	xmm4,eax		; xmm4 low dword = high nibble mask
	mov	eax, 0F0F0F0Fh
	movd	xmm3,eax		; xmm3 low dword = low nibble mask
PIXEL8:	movd	xmm0,dword ptr[esi]	; 87654321  (4 bytes = 8 pixels, shown most significant nibble first)
	movq	xmm1,xmm0
	add	esi,4
	pand	xmm0,xmm3			; -7-5-3-1  low nibbles only
	movq	xmm2,xmm0
	psllw	xmm2,4			; 7-5-3-1-	there is no byte shift; a word shift is safe because the high nibbles are zero
	por	xmm0,xmm2			; 77553311

	pand	xmm1,xmm4			; 8-6-4-2-  high nibbles only
	movq	xmm2,xmm1
	psrlw	xmm2,4			; -8-6-4-2
	por	xmm1,xmm2		; 88664422
	
	punpcklbw xmm1,xmm0		; interleave to 8 bytes; memory order: hi-nibble pixel, lo-nibble pixel, ...
	punpcklbw xmm1,xmm1		; double every byte: 8 words, one per pixel
	movaps	xmm0,xmm1
	punpcklwd xmm0,xmm0		; pixels 0..3 widened to dwords
	movaps	xmm2,xmm0
	punpcklwd xmm0,xmm0		; pixels 0 and 1 widened to qwords
	movlps	qword ptr[edi],xmm0
	movhps	qword ptr[edi+8],xmm0
	punpckhwd xmm2,xmm2		; pixels 2 and 3 widened to qwords
	movlps	qword ptr[edi+16],xmm2
	movhps	qword ptr[edi+24],xmm2
	
	punpckhwd xmm1,xmm1		; pixels 4..7 widened to dwords
	movaps	xmm2,xmm1
	punpcklwd xmm1,xmm1		; pixels 4 and 5 widened to qwords
	movlps	qword ptr[edi+32],xmm1
	movhps	qword ptr[edi+40],xmm1
	punpckhwd xmm2,xmm2		; pixels 6 and 7 widened to qwords
	movlps	qword ptr[edi+48],xmm2
	movhps	qword ptr[edi+56],xmm2

	add	edi,64
	sub	ecx,8
	jae	PIXEL8			; no borrow: at least 8 more pixels remain
;	emms	

PIXEL1:	add	ecx,8			; undo the last subtraction: ecx = pixels still to do
	jz	toend			; array has zero size or all done        
        cld				; string instructions below must move forward
        
PIXEL:	lodsb				; al = HL (two packed pixels)
	mov	ah,al
	mov	dx,ax
	sal	eax,16
	mov	ax,dx
	mov	edx,eax		; eax = edx = HLHLHLHL
	
	rol	eax,4		; eax = LHLHLHLH
	and	edx,00F0FF0F0h	; edx = 0L0LH0H0
	and	eax,0F0F00F0Fh	; eax = L0L00H0H
	or	eax,edx		; eax = LLLLHHHH
	mov	edx,eax		; edx = LLLLHHHH
	rol	eax,16		; eax = HHHHLLLL
	xchg	ax,dx		; swap low words: eax = HHHHHHHH, edx = LLLLLLLL
	stosd			; first pixel, low dword (at least one pixel is always pending here)
	stosd			; first pixel, high dword
	
	mov	eax,edx		; 2nd pixel		
	sub	ecx,2
	jb	ToEnd		; borrow: the first pixel was the last one
	stosd			; preserves ZF
	stosd
	jnz	PIXEL        	; ZF from the sub above: loop while pixels remain

ToEnd:	ret			; _cdecl return
                
Conv4_64_SSE endp


;*************************************************************************************


        public  Conv8_4_SSE
; Reduces 8-bit pixels to packed 4-bit pixels by keeping the high nibble of
; every source byte. Two pixels share one output byte; the first pixel of a
; pair lands in the high nibble. With an odd count the last output byte
; carries the final pixel in its high nibble and zero in its low nibble.
;   Dest  - output buffer, two pixels per byte
;   Src   - input buffer, one byte per pixel
;   count - number of pixels to convert
; Scratch registers: eax, ecx, xmm0, xmm1, xmm3.
Conv8_4_SSE proc \
        uses edi esi, \
        Dest:ptr byte, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]     ; ecx = number of pixels
        mov     edi,[Dest]     ; edi = destination pointer
        mov     esi,[Src]      ; esi = source pointer
        
       	sub	ecx,16
        jl	PIXEL1		; result negative: no complete group of 16 pixels, use the scalar loop only
        
        mov	eax,00F000F0h
        movd	xmm3,eax
        punpckldq xmm3,xmm3
        punpckldq xmm3,xmm3	; xmm3 = 00F0h in all 8 words
PIXEL16:movlps	xmm0,qword ptr [esi]	; 7h 7l|6h 6l|5h 5l|4h 4l | 3h 3l|2h 2l|1h 1l|0h 0l
	movhps	xmm0,qword ptr [esi+8]	; upper 8 source pixels
	movaps	xmm1,xmm0
	add	esi,16
	pand	xmm0,xmm3	;  -  -  6h -| -  -  4h - | -  -  2h -| -  -  0h - 
	psrlw	xmm1,12		;  -  -  - 7h| -  -  - 5h |-  -  - 3h| -  -  - 1h
	por	xmm0,xmm1	;  -  -  6h7h| -  -  4h5h |-  -  2h3h| -  -  0h1h
	packuswb xmm0,xmm0	; every word is <= 0FFh, so the pack just drops the zero high bytes
	movq	qword ptr [edi],xmm0	; 8 output bytes = 16 pixels
	add	edi,8
	sub	ecx,16
        jae	PIXEL16;	emms	
        
PIXEL1:	add	ecx,16		; undo the last subtraction: ecx = pixels still to do
        jz	ToEnd
        cld			; string instructions below must move forward
PIXEL:	lodsb			; load 1st byte
	and	al,0F0h		; keep its high nibble in place
	
	dec	ecx
	jnz	NIBBLE2		; another pixel follows: combine the two
	stosb			;store incomplete nibble
	jmp	ToEnd

NIBBLE2:mov	ah,al		; ah = first pixel, already in the high nibble
        lodsb			; load 2nd byte
        and	al,0F0h
	ror	al,4		; move the second pixel's high nibble to the low nibble
	or	al,ah
	stosb			;store both nibbles
	loop	PIXEL		; counts the second pixel and repeats while ecx != 0
        
ToEnd:	ret                     ; _cdecl return
                
Conv8_4_SSE endp


;*************************************************************************************


        public  Conv8_16_SSE
;-----------------------------------------------------------------------
; Conv8_16_SSE(Dest, Src, count)            32-bit cdecl
; Widens 8-bit pixels to 16 bits by copying the byte into both halves of
; the output word (AB -> ABAB).
;   Dest  - output, 2 bytes per pixel
;   Src   - input, 1 byte per pixel
;   count - number of pixels
; Scratch: eax, ecx, xmm0, xmm1, flags.
;-----------------------------------------------------------------------
Conv8_16_SSE proc \
        uses edi esi, \
        Dest:ptr qword, \
        Src:ptr byte, \
        count:DWORD

        mov     esi, [Src]              ; esi -> next source byte
        mov     edi, [Dest]             ; edi -> next output word
        mov     ecx, [count]            ; ecx = pixels left

        sub     ecx, 16
        jl      Leftover                ; less than one full batch

Wide16:                                 ; invariant: ecx = pixels left after this batch
        movlps  xmm0, qword ptr [esi]
        movhps  xmm0, qword ptr [esi+8] ; xmm0 = 16 source pixels
        add     esi, 16
        movaps  xmm1, xmm0
        punpcklbw xmm0, xmm0            ; pixels 0..7  -> words
        punpckhbw xmm1, xmm1            ; pixels 8..15 -> words
        movlps  qword ptr [edi], xmm0
        movhps  qword ptr [edi+8], xmm0
        movlps  qword ptr [edi+16], xmm1
        movhps  qword ptr [edi+24], xmm1
        add     edi, 32
        sub     ecx, 16
        jae     Wide16                  ; no borrow -> another full batch

Leftover:
        add     ecx, 16                 ; ecx = 0..15 pixels still pending
        jz      Finished
        cld                             ; stosw must advance edi
OneByOne:
        movzx   eax, byte ptr [esi]
        inc     esi
        imul    eax, eax, 0101h         ; ax = pixel in both bytes
        stosw
        dec     ecx
        jnz     OneByOne

Finished:
        ret                             ; cdecl: caller removes the arguments

Conv8_16_SSE endp


;*************************************************************************************


        public  Conv8_32_SSE
;-----------------------------------------------------------------------
; Conv8_32_SSE(Dest, Src, count)            32-bit cdecl
; Widens 8-bit pixels to 32 bits by repeating the byte four times
; (AB -> ABABABAB).
;   Dest  - output, 4 bytes per pixel
;   Src   - input, 1 byte per pixel
;   count - number of pixels
; Scratch: eax, ecx, xmm0, xmm1, flags.
;-----------------------------------------------------------------------
Conv8_32_SSE proc \
        uses edi esi, \
        Dest:ptr qword, \
        Src:ptr byte, \
        count:DWORD

        mov     esi, [Src]              ; esi -> next source byte
        mov     edi, [Dest]             ; edi -> next output dword
        mov     ecx, [count]            ; ecx = pixels left

        sub     ecx, 8
        jl      Leftover                ; less than one full batch

Wide8:                                  ; invariant: ecx = pixels left after this batch
        movq    xmm0, qword ptr [esi]   ; 8 source pixels
        add     esi, 8
        punpcklbw xmm0, xmm0            ; 8 words, byte doubled
        movaps  xmm1, xmm0
        punpcklwd xmm0, xmm0            ; pixels 0..3 -> dwords
        punpckhwd xmm1, xmm1            ; pixels 4..7 -> dwords
        movlps  qword ptr [edi], xmm0
        movhps  qword ptr [edi+8], xmm0
        movlps  qword ptr [edi+16], xmm1
        movhps  qword ptr [edi+24], xmm1
        add     edi, 32
        sub     ecx, 8
        jae     Wide8                   ; no borrow -> another full batch

Leftover:
        cld                             ; stosd must advance edi
        add     ecx, 8                  ; ecx = 0..7 pixels still pending
        jz      Finished

OneByOne:
        movzx   eax, byte ptr [esi]
        inc     esi
        imul    eax, eax, 01010101h     ; replicate the byte into all four lanes
        stosd
        dec     ecx
        jnz     OneByOne

Finished:
        ret                             ; cdecl: caller removes the arguments

Conv8_32_SSE endp


;*************************************************************************************

;-----------------------------------------------------------------------
; void Conv8_64_SSE(QWORD *Dest, const BYTE *Src, unsigned Size1D);
; 32-bit cdecl. Widens 8-bit pixels to 64 bits by repeating the byte
; eight times.
;   Dest  - output, 8 bytes per pixel
;   Src   - input, 1 byte per pixel
;   count - number of pixels
; Scratch: eax, ecx, xmm0, xmm1, flags.
;-----------------------------------------------------------------------
        public  Conv8_64_SSE
Conv8_64_SSE proc \
        uses edi esi, \
        Dest:ptr qword, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx, [count]            ; ecx = pixels left
        test    ecx, ecx
        jz      Finished                ; nothing to convert

        mov     esi, [Src]              ; esi -> next source byte
        mov     edi, [Dest]             ; edi -> next output qword

        sub     ecx, 4
        jl      Leftover                ; less than one full batch

Wide4:                                  ; invariant: ecx = pixels left after this batch
        movd    xmm0, dword ptr [esi]   ; 4 source pixels
        add     esi, 4
        punpcklbw xmm0, xmm0            ; 4 words
        punpcklwd xmm0, xmm0            ; 4 dwords
        movaps  xmm1, xmm0
        punpckldq xmm0, xmm0            ; pixels 0,1 -> qwords
        punpckhdq xmm1, xmm1            ; pixels 2,3 -> qwords
        movlps  qword ptr [edi], xmm0
        movhps  qword ptr [edi+8], xmm0
        movlps  qword ptr [edi+16], xmm1
        movhps  qword ptr [edi+24], xmm1
        add     edi, 32
        sub     ecx, 4
        jae     Wide4                   ; no borrow -> another full batch

Leftover:
        add     ecx, 4                  ; ecx = 0..3 pixels still pending
        jz      Finished
        cld                             ; stosd must advance edi
OneByOne:
        movzx   eax, byte ptr [esi]
        inc     esi
        imul    eax, eax, 01010101h     ; replicate the byte into all four lanes
        stosd                           ; low half of the output pixel
        stosd                           ; high half of the output pixel
        dec     ecx
        jnz     OneByOne

Finished:
        ret                             ; cdecl: caller removes the arguments

Conv8_64_SSE endp


;*************************************************************************************


        public  Conv16_4_SSE
;-----------------------------------------------------------------------
; Conv16_4_SSE(Dest, Src, count)            32-bit cdecl
; Reduces 16-bit pixels to packed 4-bit pixels by keeping the top nibble
; of every source word. Two pixels share one output byte; the first pixel
; of a pair lands in the high nibble. With an odd count the last output
; byte carries the final pixel in its high nibble and zero below it.
;   Dest  - output, two pixels per byte
;   Src   - input, 2 bytes per pixel (little endian)
;   count - number of pixels
; Scratch: eax, ecx, xmm0, xmm1, xmm3, flags.
;-----------------------------------------------------------------------
Conv16_4_SSE proc \
        uses edi esi, \
        Dest:ptr byte, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]             ; ecx = pixels left
        mov     edi,[Dest]              ; edi -> next output byte
        mov     esi,[Src]               ; esi -> next source word

        sub     ecx,8
        jl      PIXEL1                  ; less than one full batch of 8

        mov     eax,00F000F0h
        movd    xmm3,eax
        punpckldq xmm3,xmm3             ; low qword of xmm3 = 00F0h in 4 words
PIXEL8: movlps  xmm0,qword ptr[esi]     ; 8888 7777 6666 5555 4444 3333 2222 1111
        movhps  xmm0,qword ptr[esi+8]
        add     esi,16
        psrlw   xmm0,8                  ; 0088 0077 0066 0055 0044 0033 0022 0011
        packuswb xmm0,xmm0              ; 88 77 66 55 44 33 22 11 (high bytes only)
        movq    xmm1,xmm0               ; copy low qword, upper half cleared
        pand    xmm0,xmm3               ; high nibble of the even pixel, kept in bits 4..7
        psrlw   xmm1,12                 ; high nibble of the odd pixel, moved to bits 0..3
        por     xmm0,xmm1               ; one packed byte per word
        packuswb xmm0,xmm0              ; 4 packed bytes = 8 pixels
        movd    dword ptr [edi],xmm0
        add     edi,4
        sub     ecx,8
        jae     PIXEL8                  ; no borrow -> another full batch

PIXEL1: add     ecx,8                   ; ecx = 0..7 pixels still pending
        jz      ToEnd
        cld                             ; lodsb/stosb must move forward
PIXEL:  inc     esi                     ; skip the low byte of the 1st pixel
        lodsb                           ; load 1st hi byte
        and     al,0F0h

        dec     ecx
        jnz     NIBBLE2
        stosb                           ; store incomplete nibble
        jmp     ToEnd

NIBBLE2:mov     ah,al
        inc     esi                     ; skip the low byte of the 2nd pixel; must be the full
                                        ; 32-bit register - "inc si" would wrap at a 64 KiB boundary
        lodsb                           ; load 2nd hi byte
        and     al,0F0h
        ror     al,4                    ; move it to the low nibble
        or      al,ah
        stosb                           ; store both nibbles
        dec     ecx
        jnz     PIXEL

ToEnd:  ret                             ; _cdecl return

Conv16_4_SSE endp


;*************************************************************************************

        public  Conv16_8_SSE
;-----------------------------------------------------------------------
; Conv16_8_SSE(Dest, Src, count)            32-bit cdecl
; Reduces 16-bit pixels to 8 bits by keeping the high byte of each word.
;   Dest  - output, 1 byte per pixel
;   Src   - input, 2 bytes per pixel (little endian)
;   count - number of pixels
; Scratch: eax, ecx, xmm0, flags.
;-----------------------------------------------------------------------
Conv16_8_SSE proc \
        uses edi esi, \
        Dest:ptr qword, \
        Src:ptr byte, \
        count:DWORD

        mov     esi, [Src]              ; esi -> next source word
        mov     edi, [Dest]             ; edi -> next output byte
        mov     ecx, [count]            ; ecx = pixels left

        sub     ecx, 8
        jl      Leftover                ; less than one full batch

Narrow8:                                ; invariant: ecx = pixels left after this batch
        movlps  xmm0, qword ptr [esi]
        movhps  xmm0, qword ptr [esi+8] ; 8 source pixels
        add     esi, 16
        psrlw   xmm0, 8                 ; high byte of every word moved down
        packuswb xmm0, xmm0             ; words are <= 0FFh, so no saturation occurs
        movq    qword ptr [edi], xmm0
        add     edi, 8
        sub     ecx, 8
        jae     Narrow8                 ; no borrow -> another full batch

Leftover:
        cld                             ; stosb must advance edi
        add     ecx, 8                  ; ecx = 0..7 pixels still pending
        jz      Finished

OneByOne:
        movzx   eax, word ptr [esi]
        add     esi, 2
        shr     eax, 8                  ; al = high byte of the pixel
        stosb
        dec     ecx
        jnz     OneByOne

Finished:
        ret                             ; cdecl: caller removes the arguments

Conv16_8_SSE endp


;*************************************************************************************


        public  Conv16_32_SSE
;-----------------------------------------------------------------------
; Conv16_32_SSE(Dest, Src, count)           32-bit cdecl
; Widens 16-bit pixels to 32 bits by copying the word into both halves of
; the output dword (ABCD -> ABCDABCD).
;   Dest  - output, 4 bytes per pixel
;   Src   - input, 2 bytes per pixel
;   count - number of pixels
; Scratch: eax, ecx, xmm0, flags.
;-----------------------------------------------------------------------
Conv16_32_SSE proc \
        uses edi esi, \
        Dest:ptr dword, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx, [count]            ; ecx = pixels left
        test    ecx, ecx
        jz      Finished                ; nothing to convert

        mov     esi, [Src]              ; esi -> next source word
        mov     edi, [Dest]             ; edi -> next output dword

        sub     ecx, 4
        jl      Leftover                ; less than one full batch

Wide4:                                  ; invariant: ecx = pixels left after this batch
        movq    xmm0, qword ptr [esi]   ; 4 source pixels
        add     esi, 8
        punpcklwd xmm0, xmm0            ; every word doubled -> 4 dwords
        movlps  qword ptr [edi], xmm0
        movhps  qword ptr [edi+8], xmm0
        add     edi, 16
        sub     ecx, 4
        jae     Wide4                   ; no borrow -> another full batch

Leftover:
        add     ecx, 4                  ; ecx = 0..3 pixels still pending
        jz      Finished
        cld                             ; stosd must advance edi
OneByOne:
        movzx   eax, word ptr [esi]
        add     esi, 2
        imul    eax, eax, 00010001h     ; replicate the word into both halves
        stosd
        dec     ecx
        jnz     OneByOne

Finished:
        ret                             ; cdecl: caller removes the arguments

Conv16_32_SSE endp


;*************************************************************************************

        public  Conv16_64_SSE
;-----------------------------------------------------------------------
; Conv16_64_SSE(Dest, Src, count)           32-bit cdecl
; Widens 16-bit pixels to 64 bits by repeating the word four times.
;   Dest  - output, 8 bytes per pixel
;   Src   - input, 2 bytes per pixel
;   count - number of pixels
; Scratch: eax, ecx, xmm0, flags.
;-----------------------------------------------------------------------
Conv16_64_SSE proc \
        uses edi esi, \
        Dest:ptr dword, \
        Src:ptr byte, \
        count:DWORD

        mov     esi, [Src]              ; esi -> next source word
        mov     edi, [Dest]             ; edi -> next output qword
        mov     ecx, [count]            ; ecx = pixels left

        sub     ecx, 2
        jl      Leftover                ; less than one full batch

Wide2:                                  ; invariant: ecx = pixels left after this batch
        movd    xmm0, dword ptr [esi]   ; 2 source pixels
        add     esi, 4
        punpcklwd xmm0, xmm0            ; 2 dwords, word doubled
        punpckldq xmm0, xmm0            ; 2 qwords, dword doubled
        movlps  qword ptr [edi], xmm0
        movhps  qword ptr [edi+8], xmm0
        add     edi, 16
        sub     ecx, 2
        jae     Wide2                   ; no borrow -> another full batch

Leftover:
        add     ecx, 2                  ; ecx = pixels still pending
        jz      Finished
        cld                             ; stosd must advance edi
OneByOne:
        movzx   eax, word ptr [esi]
        add     esi, 2
        imul    eax, eax, 00010001h     ; replicate the word into both halves
        stosd                           ; low half of the output pixel
        stosd                           ; high half of the output pixel
        dec     ecx
        jnz     OneByOne

Finished:
        ret                             ; cdecl: caller removes the arguments

Conv16_64_SSE endp



;*************************************************************************************


        public  Conv32_16_SSE
;-----------------------------------------------------------------------
; Conv32_16_SSE(Dest, Src, count)           32-bit cdecl
; Reduces 32-bit pixels to 16 bits by keeping the high word of each dword.
;   Dest  - output, 2 bytes per pixel
;   Src   - input, 4 bytes per pixel (little endian)
;   count - number of pixels
; Scratch: ecx, xmm0, flags.
;-----------------------------------------------------------------------
Conv32_16_SSE proc \
        uses edi esi, \
        Dest:ptr word, \
        Src:ptr dword, \
        count:DWORD

        mov     ecx,[count]             ; ecx = pixels left
        mov     edi,[Dest]              ; edi -> next output word
        mov     esi,[Src]               ; esi -> next source dword

        sub     ecx,4
        jl      PIXEL1                  ; less than one full batch of 4

PIXEL4: movlps  xmm0,qword ptr[esi]     ; dword pixels 1,2,3,4
        movhps  xmm0,qword ptr[esi+8]
        add     esi,16
        psrad   xmm0,16                 ; arithmetic shift: each dword becomes the sign-extended
                                        ; high word, so the signed pack below cannot saturate
                                        ; (a logical shift clamps pixels >= 8000h to 7FFFh)
        packssdw xmm0,xmm0              ; 4 words = exact high words of the 4 pixels
        movq    qword ptr [edi],xmm0
        add     edi,8
        sub     ecx,4
        jae     PIXEL4                  ; no borrow -> another full batch

PIXEL1: add     ecx,4                   ; ecx = 0..3 pixels still pending
        jz      ToEnd                   ; array has zero size or all job done
        cld                             ; movsw must move forward
PIXEL:  add     esi,2                   ; skip the low word of the source pixel
        movsw                           ; copy the high word, advancing esi and edi by 2
        dec     ecx
        jnz     PIXEL

ToEnd:
        ret                             ; _cdecl return

Conv32_16_SSE endp


;*************************************************************************************


        public  Conv32_64_SSE
;-----------------------------------------------------------------------
; Conv32_64_SSE(Dest, Src, count)           32-bit cdecl
; Widens 32-bit pixels to 64 bits by copying the dword into both halves of
; the output qword. Returns without writing when Dest or Src is null.
;   Dest  - output, 8 bytes per pixel
;   Src   - input, 4 bytes per pixel
;   count - number of pixels
; Scratch: eax, ecx, edx, xmm0, flags.
;-----------------------------------------------------------------------
Conv32_64_SSE proc \
        uses esi, \
        Dest:ptr qword, \
        Src:ptr dword, \
        count:DWORD

        mov     ecx, [count]            ; ecx = pixels left
        mov     edx, [Dest]             ; edx -> next output qword
        test    edx, edx
        jz      Finished                ; null destination
        mov     esi, [Src]              ; esi -> next source dword
        test    esi, esi
        jz      Finished                ; null source

        sub     ecx, 2
        jl      Leftover                ; less than one full batch

Wide2:                                  ; invariant: ecx = pixels left after this batch
        movq    xmm0, qword ptr [esi]   ; 2 source pixels
        add     esi, 8
        punpckldq xmm0, xmm0            ; every dword doubled -> 2 qwords
        movlps  qword ptr [edx], xmm0
        movhps  qword ptr [edx+8], xmm0
        add     edx, 16
        sub     ecx, 2
        jae     Wide2                   ; no borrow -> another full batch

Leftover:
        add     ecx, 2                  ; ecx = pixels still pending
        jz      Finished
OneByOne:
        mov     eax, dword ptr [esi]
        add     esi, 4
        mov     dword ptr [edx], eax    ; low half of the output pixel
        mov     dword ptr [edx+4], eax  ; high half of the output pixel
        add     edx, 8
        dec     ecx
        jnz     OneByOne

Finished:
        ret                             ; cdecl: caller removes the arguments

Conv32_64_SSE endp



        end
