/*
 *  linux/arch/arm26/lib/csumpartialcopygeneric.S
 *
 *  Copyright (C) 1995-2001 Russell King
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * JMA 01/06/03 Commented out some shl0s; probably irrelevant to arm26
 *
 */

/*
 * unsigned int
 * csum_partial_copy_xxx(const char *src, char *dst, int len, int sum, )
 *  r0 = src, r1 = dst, r2 = len, r3 = sum
 *  Returns : r0 = checksum
 *
 * Note that 'tst' and 'teq' preserve the carry flag.
 */

/* Quick hack */
                /*
                 * save_regs: push the working registers and frame words
                 * with a full-descending store.  r1 (dst) is the lowest
                 * numbered register in the list, so it ends up at
                 * [sp, #0], which is where the exit path reloads it from.
                 */
                .macro  save_regs
                stmdb   sp!, {r1, r4, r5, r6, r7, r8, fp, ip, lr, pc}
                .endm

/* end Quick Hack */

/* Symbolic names for the four argument registers listed in the header. */
src	.req	r0		@ source pointer
dst	.req	r1		@ destination pointer
len	.req	r2		@ number of bytes still to copy
sum	.req	r3		@ incoming / running checksum

/*
 * Zero-length request (reached from .less8): nothing is copied and the
 * incoming sum is handed back unchanged in r0.
 * load_regs is not defined in this file; presumably it pops the
 * save_regs frame and returns to the caller (TODO confirm).
 */
.zero:		mov	r0, sum
		load_regs	ea

		/*
		 * Align an unaligned destination pointer.  We know that
		 * we have >= 8 bytes here, so we don't need to check
		 * the length.  Note that the source pointer hasn't been
		 * aligned yet.
		 *
		 * Called with 'bl' from the main entry and returns through
		 * lr.  Copies 1, 2 or 3 bytes one at a time, folding each
		 * into sum with adcs; the carry of the last adcs is left in
		 * C for the caller.  ip and r8 are used as scratch.
		 *
		 * load1b/load2b are not defined in this file; presumably
		 * they fetch one/two bytes from src into the named
		 * registers and advance src (TODO confirm).
		 */
.dst_unaligned:	tst	dst, #1
		beq	.dst_16bit		@ dst even: only a halfword to go

		load1b	ip
		sub	len, len, #1
		adcs	sum, sum, ip, lsl #byte(1)	@ update checksum
		strb	ip, [dst], #1
		tst	dst, #2			@ tst leaves C from adcs intact
		moveq	pc, lr			@ dst is now 32bit aligned

		/* dst is 16-bit but not 32-bit aligned: copy one byte pair. */
.dst_16bit:	load2b	r8, ip
		sub	len, len, #2
		adcs	sum, sum, r8, lsl #byte(0)
		strb	r8, [dst], #1
		adcs	sum, sum, ip, lsl #byte(1)
		strb	ip, [dst], #1
		mov	pc, lr			@ dst is now 32bit aligned

		/*
		 * Handle 0 to 7 bytes, with any alignment of source and
		 * destination pointers.  Note that when we get here, C = 0
		 * (the entry code arrives via 'cmp len, #8' / 'blo', and
		 * blo is only taken with carry clear).
		 *
		 * Everything is copied bytewise: an optional leading byte
		 * when dst is odd, then byte pairs for as long as len has
		 * bit 1 or bit 2 set, then an optional trailing byte.
		 * Only sub/strb/tst/teq sit between the adcs instructions,
		 * so the carry chain is never broken.  ip and r8 are
		 * scratch.
		 *
		 * load1b/load2b are not defined in this file; presumably
		 * they fetch one/two bytes from src and advance it
		 * (TODO confirm).
		 */
.less8:		teq	len, #0			@ check for zero count
		beq	.zero

		/* we must have at least one byte. */
		tst	dst, #1			@ dst 16-bit aligned
		beq	.less8_aligned

		/* Align dst */
		load1b	ip
		sub	len, len, #1
		adcs	sum, sum, ip, lsl #byte(1)	@ update checksum
		strb	ip, [dst], #1
		tst	len, #6			@ any byte pairs left?
		beq	.less8_byteonly

		/* Copy one byte pair per iteration. */
1:		load2b	r8, ip
		sub	len, len, #2
		adcs	sum, sum, r8, lsl #byte(0)
		strb	r8, [dst], #1
		adcs	sum, sum, ip, lsl #byte(1)
		strb	ip, [dst], #1
.less8_aligned:	tst	len, #6			@ loop while len & 6 != 0
		bne	1b
.less8_byteonly:
		tst	len, #1			@ odd byte at the end?
		beq	.done
		load1b	r8
		adcs	sum, sum, r8, lsl #byte(0)	@ update checksum
		strb	r8, [dst], #1
		b	.done

/*
 * Main entry point.  FN_ENTRY is not defined in this file; presumably the
 * including file expands it to the exported symbol (TODO confirm).
 *
 *  In:   r0 = src, r1 = dst, r2 = len, r3 = sum
 *  Out:  r0 = checksum
 *  Uses: r4-r8 and ip as scratch; r4-r8 are pushed by save_regs.
 *
 * The load1l/load2l/load4l macros are not defined here either; presumably
 * they load one/two/four words from src and advance it (TODO confirm).
 * 'push' is likewise external; it is paired with 'pull' elsewhere in this
 * file as the opposite shift direction.
 */
FN_ENTRY
		mov	ip, sp
		save_regs
		sub	fp, ip, #4

		cmp	len, #8			@ Ensure that we have at least
		blo	.less8			@ 8 bytes to copy.

		adds	sum, sum, #0		@ C = 0
		tst	dst, #3			@ Test destination alignment
		blne	.dst_unaligned		@ align destination, return here

		/*
		 * Ok, the dst pointer is now 32bit aligned, and we know
		 * that we must have more than 4 bytes to copy.  Note
		 * that C contains the carry from the dst alignment above.
		 */

		tst	src, #3			@ Test source alignment
		bne	.src_not_aligned

		/* Routine for src & dst aligned */

		bics	ip, len, #15		@ ip = len rounded down to 16
		beq	2f

		/*
		 * 16 bytes per iteration.  The counter is stepped with
		 * sub + teq (not subs) so that C keeps carrying the
		 * checksum overflow from one adcs to the next.
		 */
1:		load4l	r4, r5, r6, r7
		stmia	dst!, {r4, r5, r6, r7}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		adcs	sum, sum, r6
		adcs	sum, sum, r7
		sub	ip, ip, #16
		teq	ip, #0
		bne	1b

		/* 0, 4, 8 or 12 bytes of whole words left (len bits 2-3). */
2:		ands	ip, len, #12
		beq	4f
		tst	ip, #8
		beq	3f
		load2l	r4, r5
		stmia	dst!, {r4, r5}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		tst	ip, #4
		beq	4f

3:		load1l	r4
		str	r4, [dst], #4
		adcs	sum, sum, r4

		/* 0 to 3 trailing bytes (len bits 0-1). */
4:		ands	len, len, #3
		beq	.done
		load1l	r4
		tst	len, #2
		/*
		 * r5 must hold the next byte to be stored on both sides
		 * of the branch below.  Extracting byte 0 is a shift by
		 * zero, so a plain mov is enough, but the move itself
		 * cannot be dropped: without it the strb here and at
		 * .exit would store whatever r5 last contained.  mov
		 * does not touch the flags, so beq still sees the tst.
		 * NOTE(review): assumes byte(0) == 0 -- confirm against
		 * the byte() definition.
		 */
		mov	r5, r4
		beq	.exit
		adcs	sum, sum, r4, push #16	@ sum the two bytes being stored
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(1)
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(2)	@ candidate third byte for .exit

		/*
		 * Common tail, also entered from the unaligned-source
		 * paths.  On entry r5 holds the next byte in its low
		 * 8 bits; it is stored and summed only if len bit 0 is
		 * set.
		 */
.exit:		tst	len, #1
		strneb	r5, [dst], #1
		andne	r5, r5, #255
		adcnes	sum, sum, r5, lsl #byte(0)

		/*
		 * If the dst pointer was not 16-bit aligned, we
		 * need to rotate the checksum here to get around
		 * the inefficient byte manipulations in the
		 * architecture independent code.
		 *
		 * The original dst is reloaded from the lowest slot of
		 * the save_regs frame; 'sum' is free to be reused as
		 * scratch once the final carry has been folded into r0.
		 */
.done:		adc	r0, sum, #0		@ fold in the last carry
		ldr	sum, [sp, #0]		@ dst
		tst	sum, #1
		movne	sum, r0, lsl #8		@ rotate r0 left by 8 bits
		orrne	r0, sum, r0, lsr #24
		load_regs	ea

/*
 * Source not word aligned (dst already is).  src is rounded down to a
 * word boundary and the first word is fetched into r5; the code then
 * dispatches on the original low two bits of src.  Offsets 2 and 3 branch
 * away; the code that follows handles offset 1.
 *
 * Throughout, r4 carries the not-yet-stored bytes of the previous source
 * word (moved with 'pull #8'), and each output word is r4 merged with the
 * leading byte of the next source word (moved with 'push #24').
 * pull/push and the load*l macros are not defined in this file;
 * presumably pull/push are the two opposite shifts and load1l/2l/4l load
 * one/two/four words from src and advance it (TODO confirm).
 */
.src_not_aligned:
		adc	sum, sum, #0		@ include C from dst alignment
		and	ip, src, #3
		bic	src, src, #3
		load1l	r5
		cmp	ip, #2
		beq	.src2_aligned
		bhi	.src3_aligned
		mov	r4, r5, pull #8		@ C = 0
		bics	ip, len, #15		@ ip = len rounded down to 16
		beq	2f

		/*
		 * 16 bytes per iteration; r8 is the look-ahead word whose
		 * remaining bytes become the next r4.  sub + teq keep C
		 * intact for the adcs chain.
		 */
1:		load4l	r5, r6, r7, r8
		orr	r4, r4, r5, push #24
		mov	r5, r5, pull #8
		orr	r5, r5, r6, push #24
		mov	r6, r6, pull #8
		orr	r6, r6, r7, push #24
		mov	r7, r7, pull #8
		orr	r7, r7, r8, push #24
		stmia	dst!, {r4, r5, r6, r7}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		adcs	sum, sum, r6
		adcs	sum, sum, r7
		mov	r4, r8, pull #8
		sub	ip, ip, #16
		teq	ip, #0
		bne	1b

		/* 0, 4, 8 or 12 bytes of whole words left (len bits 2-3). */
2:		ands	ip, len, #12
		beq	4f
		tst	ip, #8
		beq	3f
		load2l	r5, r6
		orr	r4, r4, r5, push #24
		mov	r5, r5, pull #8
		orr	r5, r5, r6, push #24
		stmia	dst!, {r4, r5}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		mov	r4, r6, pull #8
		tst	ip, #4
		beq	4f
3:		load1l	r5
		orr	r4, r4, r5, push #24
		str	r4, [dst], #4
		adcs	sum, sum, r4
		mov	r4, r5, pull #8

		/* 0 to 3 trailing bytes, all of them already held in r4. */
4:		ands	len, len, #3
		beq	.done
		/*
		 * r5 must hold the next byte to be stored.  Extracting
		 * byte 0 is a shift by zero, so a plain mov is enough,
		 * but the move itself cannot be dropped: r5 otherwise
		 * still holds an earlier source word, and its low byte
		 * is not the byte that belongs here.  mov does not touch
		 * the flags.
		 * NOTE(review): assumes byte(0) == 0 -- confirm against
		 * the byte() definition.
		 */
		mov	r5, r4
		tst	len, #2
		beq	.exit
		adcs	sum, sum, r4, push #16	@ sum the two bytes being stored
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(1)
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(2)	@ candidate third byte for .exit
		b	.exit

/*
 * Source is 2 bytes past a word boundary (dst is word aligned).
 * On entry r5 holds the word loaded from the rounded-down src.
 *
 * r4 carries the two not-yet-stored bytes of the previous source word
 * ('pull #16'); each output word is r4 merged with the leading two bytes
 * of the next source word ('push #16').
 * pull/push and the load* macros are not defined in this file;
 * presumably pull/push are the two opposite shifts, load1l/2l/4l load
 * words from src and load1b loads one byte, each advancing src
 * (TODO confirm).
 */
.src2_aligned:	mov	r4, r5, pull #16
		adds	sum, sum, #0		@ C = 0 (the cmp on the way here set it)
		bics	ip, len, #15		@ ip = len rounded down to 16
		beq	2f

		/*
		 * 16 bytes per iteration; r8 is the look-ahead word.
		 * sub + teq keep C intact for the adcs chain.
		 */
1:		load4l	r5, r6, r7, r8
		orr	r4, r4, r5, push #16
		mov	r5, r5, pull #16
		orr	r5, r5, r6, push #16
		mov	r6, r6, pull #16
		orr	r6, r6, r7, push #16
		mov	r7, r7, pull #16
		orr	r7, r7, r8, push #16
		stmia	dst!, {r4, r5, r6, r7}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		adcs	sum, sum, r6
		adcs	sum, sum, r7
		mov	r4, r8, pull #16
		sub	ip, ip, #16
		teq	ip, #0
		bne	1b

		/* 0, 4, 8 or 12 bytes of whole words left (len bits 2-3). */
2:		ands	ip, len, #12
		beq	4f
		tst	ip, #8
		beq	3f
		load2l	r5, r6
		orr	r4, r4, r5, push #16
		mov	r5, r5, pull #16
		orr	r5, r5, r6, push #16
		stmia	dst!, {r4, r5}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		mov	r4, r6, pull #16
		tst	ip, #4
		beq	4f
3:		load1l	r5
		orr	r4, r4, r5, push #16
		str	r4, [dst], #4
		adcs	sum, sum, r4
		mov	r4, r5, pull #16

		/* 0 to 3 trailing bytes; r4 holds only the first two. */
4:		ands	len, len, #3
		beq	.done
		/*
		 * r5 must hold the next byte to be stored.  Extracting
		 * byte 0 is a shift by zero, so a plain mov is enough,
		 * but the move itself cannot be dropped: r5 otherwise
		 * still holds an earlier source word.  mov does not
		 * touch the flags.
		 * NOTE(review): assumes byte(0) == 0 -- confirm against
		 * the byte() definition.
		 */
		mov	r5, r4
		tst	len, #2
		beq	.exit
		adcs	sum, sum, r4		@ r4 is exactly the two bytes stored
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(1)
		strb	r5, [dst], #1
		tst	len, #1
		beq	.done
		load1b	r5			@ third byte comes from the next word
		b	.exit

/*
 * Source is 3 bytes past a word boundary (dst is word aligned).
 * On entry r5 holds the word loaded from the rounded-down src.
 *
 * r4 carries the single not-yet-stored byte of the previous source word
 * ('pull #24'); each output word is r4 merged with the leading three
 * bytes of the next source word ('push #8').
 * pull/push and the load*l macros are not defined in this file;
 * presumably pull/push are the two opposite shifts and load1l/2l/4l load
 * one/two/four words from src and advance it (TODO confirm).
 */
.src3_aligned:	mov	r4, r5, pull #24
		adds	sum, sum, #0		@ C = 0 (the cmp on the way here set it)
		bics	ip, len, #15		@ ip = len rounded down to 16
		beq	2f

		/*
		 * 16 bytes per iteration; r8 is the look-ahead word.
		 * sub + teq keep C intact for the adcs chain.
		 */
1:		load4l	r5, r6, r7, r8
		orr	r4, r4, r5, push #8
		mov	r5, r5, pull #24
		orr	r5, r5, r6, push #8
		mov	r6, r6, pull #24
		orr	r6, r6, r7, push #8
		mov	r7, r7, pull #24
		orr	r7, r7, r8, push #8
		stmia	dst!, {r4, r5, r6, r7}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		adcs	sum, sum, r6
		adcs	sum, sum, r7
		mov	r4, r8, pull #24
		sub	ip, ip, #16
		teq	ip, #0
		bne	1b

		/* 0, 4, 8 or 12 bytes of whole words left (len bits 2-3). */
2:		ands	ip, len, #12
		beq	4f
		tst	ip, #8
		beq	3f
		load2l	r5, r6
		orr	r4, r4, r5, push #8
		mov	r5, r5, pull #24
		orr	r5, r5, r6, push #8
		stmia	dst!, {r4, r5}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		mov	r4, r6, pull #24
		tst	ip, #4
		beq	4f
3:		load1l	r5
		orr	r4, r4, r5, push #8
		str	r4, [dst], #4
		adcs	sum, sum, r4
		mov	r4, r5, pull #24

		/* 0 to 3 trailing bytes; r4 holds only the first one. */
4:		ands	len, len, #3
		beq	.done
		/*
		 * r5 must hold the next byte to be stored.  Extracting
		 * byte 0 is a shift by zero, so a plain mov is enough,
		 * but the move itself cannot be dropped: r5 otherwise
		 * still holds an earlier source word.  mov does not
		 * touch the flags.
		 * NOTE(review): assumes byte(0) == 0 -- confirm against
		 * the byte() definition.
		 */
		mov	r5, r4
		tst	len, #2
		beq	.exit
		strb	r5, [dst], #1
		adcs	sum, sum, r4		@ r4 is exactly the byte just stored
		load1l	r4			@ remaining bytes come from the next word
		mov	r5, r4			@ byte 0 of the new word (see note above)
		strb	r5, [dst], #1
		adcs	sum, sum, r4, push #24	@ sum only that byte, in the top lane
		mov	r5, r4, lsr #byte(1)	@ candidate third byte for .exit
		b	.exit