dnl  Alpha mpn_bdiv_dbm1c.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:     42
C EV5:     18
C EV6:      3

C TODO
C  * Try less unrolling, 2-way should give the same performance.
C  * Optimize feed-in and wind-down code, for speed, and perhaps further for
C    code size.
C  * This runs optimally given the algorithm, r8 is on a 3 operation recurrency
C    path.  We have not tried very hard to find a better algorithm.  Perhaps
C    it would be a good task for the GNU superoptimizer.

C INPUT PARAMETERS
define(`rp', `r16')
define(`up', `r17')
define(`n',  `r18')
define(`bd', `r19')
define(`cy', `r19')


ASM_START()
PROLOGUE(mpn_bdiv_dbm1c)
	mov	r20, r8

	ldq	r24, 0(r17)
	and	r18, 3, r28
	lda	r18, -4(r18)
	beq	r28, L(b0)
	cmpeq	r28, 1, r21
	bne	r21, L(b1)
	cmpeq	r28, 2, r21
	bne	r21, L(b2)


L(b3):	ldq	r2, 8(r17)
	ldq	r3, 16(r17)
	bgt	r18, L(gt3)

	mulq	r24, r19, r5	C U1
	umulh	r24, r19, r21	C U1
	mulq	r2, r19, r6	C U1
	umulh	r2, r19, r22	C U1
	mulq	r3, r19, r7	C U1
	umulh	r3, r19, r23	C U1
	lda	r16, -32(r16)
	br	L(cj3)

L(gt3):	ldq	r0, 24(r17)
	mulq	r24, r19, r5	C U1
	umulh	r24, r19, r21	C U1
	ldq	r1, 32(r17)
	mulq	r2, r19, r6	C U1
	umulh	r2, r19, r22	C U1
	ldq	r2, 40(r17)
	mulq	r3, r19, r7	C U1
	umulh	r3, r19, r23	C U1
	ldq	r3, 48(r17)
	lda	r18, -4(r18)
	lda	r17, 56(r17)
	mulq	r0, r19, r4	C U1
	bgt	r18, L(L3)

	br	L(cj7)


L(b2):	ldq	r3, 8(r17)
	bgt	r18, L(gt2)

	mulq	r24, r19, r6	C U1
	umulh	r24, r19, r22	C U1
	mulq	r3, r19, r7	C U1
	umulh	r3, r19, r23	C U1
	lda	r16, -40(r16)
	br	L(cj2)

L(gt2):	ldq	r0, 16(r17)
	ldq	r1, 24(r17)
	mulq	r24, r19, r6	C U1
	umulh	r24, r19, r22	C U1
	ldq	r2, 32(r17)
	mulq	r3, r19, r7	C U1
	umulh	r3, r19, r23	C U1
	ldq	r3, 40(r17)
	lda	r18, -4(r18)
	lda	r17, 48(r17)
	mulq	r0, r19, r4	C U1
	umulh	r0, r19, r20	C U1
	lda	r16, -8(r16)
	bgt	r18, L(gt6)

	mulq	r1, r19, r5	C U1
	br	L(cj6)

L(gt6):	ldq	r0, 0(r17)
	mulq	r1, r19, r5	C U1
	br	L(L2)


L(b1):	bgt	r18, L(gt1)

	mulq	r24, r19, r7	C U1
	umulh	r24, r19, r23	C U1
	lda	r16, -48(r16)
	br	L(cj1)

L(gt1):	ldq	r0, 8(r17)
	ldq	r1, 16(r17)
	ldq	r2, 24(r17)
	mulq	r24, r19, r7	C U1
	umulh	r24, r19, r23	C U1
	ldq	r3, 32(r17)
	lda	r18, -4(r18)
	lda	r17, 40(r17)
	mulq	r0, r19, r4	C U1
	umulh	r0, r19, r20	C U1
	lda	r16, -16(r16)
	bgt	r18, L(gt5)

	mulq	r1, r19, r5	C U1
	umulh	r1, r19, r21	C U1
	mulq	r2, r19, r6	C U1
	br	L(cj5)

L(gt5):	ldq	r0, 0(r17)
	mulq	r1, r19, r5	C U1
	umulh	r1, r19, r21	C U1
	ldq	r1, 8(r17)
	mulq	r2, r19, r6	C U1
	br	L(L1)


L(b0):	ldq	r1, 8(r17)
	ldq	r2, 16(r17)
	ldq	r3, 24(r17)
	lda	r17, 32(r17)
	lda	r16, -24(r16)
	mulq	r24, r19, r4	C U1
	umulh	r24, r19, r20	C U1
	bgt	r18, L(gt4)

	mulq	r1, r19, r5	C U1
	umulh	r1, r19, r21	C U1
	mulq	r2, r19, r6	C U1
	umulh	r2, r19, r22	C U1
	mulq	r3, r19, r7	C U1
	br	L(cj4)

L(gt4):	ldq	r0, 0(r17)
	mulq	r1, r19, r5	C U1
	umulh	r1, r19, r21	C U1
	ldq	r1, 8(r17)
	mulq	r2, r19, r6	C U1
	umulh	r2, r19, r22	C U1
	ldq	r2, 16(r17)
	mulq	r3, r19, r7	C U1
	br	L(L0)

C *** MAIN LOOP START ***
	ALIGN(16)
L(top):	mulq	r0, r19, r4	C U1
	subq	r8, r28, r8
L(L3):	umulh	r0, r19, r20	C U1
	cmpult	r8, r5, r28
	ldq	r0, 0(r17)
	subq	r8, r5, r8
	addq	r21, r28, r28
	stq	r8, 0(r16)

	mulq	r1, r19, r5	C U1
	subq	r8, r28, r8
L(L2):	umulh	r1, r19, r21	C U1
	cmpult	r8, r6, r28
	ldq	r1, 8(r17)
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 8(r16)

	mulq	r2, r19, r6	C U1
	subq	r8, r28, r8
L(L1):	umulh	r2, r19, r22	C U1
	cmpult	r8, r7, r28
	ldq	r2, 16(r17)
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 16(r16)

	mulq	r3, r19, r7	C U1
	subq	r8, r28, r8
L(L0):	umulh	r3, r19, r23	C U1
	cmpult	r8, r4, r28
	ldq	r3, 24(r17)
	subq	r8, r4, r8
	addq	r20, r28, r28
	stq	r8, 24(r16)

	lda	r18, -4(r18)
	lda	r17, 32(r17)
	lda	r16, 32(r16)
	bgt	r18, L(top)
C *** MAIN LOOP END ***

	mulq	r0, r19, r4	C U1
	subq	r8, r28, r8
L(cj7):	umulh	r0, r19, r20	C U1
	cmpult	r8, r5, r28
	subq	r8, r5, r8
	addq	r21, r28, r28
	stq	r8, 0(r16)
	mulq	r1, r19, r5	C U1
	subq	r8, r28, r8
L(cj6):	umulh	r1, r19, r21	C U1
	cmpult	r8, r6, r28
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 8(r16)
	mulq	r2, r19, r6	C U1
	subq	r8, r28, r8
L(cj5):	umulh	r2, r19, r22	C U1
	cmpult	r8, r7, r28
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 16(r16)
	mulq	r3, r19, r7	C U1
	subq	r8, r28, r8
L(cj4):	umulh	r3, r19, r23	C U1
	cmpult	r8, r4, r28
	subq	r8, r4, r8
	addq	r20, r28, r28
	stq	r8, 24(r16)
	subq	r8, r28, r8
L(cj3):	cmpult	r8, r5, r28
	subq	r8, r5, r8
	addq	r21, r28, r28
	stq	r8, 32(r16)
	subq	r8, r28, r8
L(cj2):	cmpult	r8, r6, r28
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 40(r16)
	subq	r8, r28, r8
L(cj1):	cmpult	r8, r7, r28
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 48(r16)
	subq	r8, r28, r0
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()
