diff --git a/arm/macros.inc b/arm/macros.inc
index 37e5c93..8b70725 100644
--- a/arm/macros.inc
+++ b/arm/macros.inc
@@ -13,7 +13,7 @@ rloopindex .req r11
 rlooplimit .req r12
 
 @ default register mapping
-@ sp .req r13
+@ sp .req r13 @ rsp (return stack pointer)
 @ lr .req r14
 @ pc .req r15
 
diff --git a/arm/words/double.s b/arm/words/double.s
index 09faf6f..61b9867 100644
--- a/arm/words/double.s
+++ b/arm/words/double.s
@@ -15,17 +15,18 @@
 @ ( 1L 1H 2L tos: 2H -- Rem-L Rem-H Quot-L tos: Quot-H )
 @------------------------------------------------------------------------------
 @ use faster um/mod if divisor is 32-bits
-@ TODO: This crashes hard in QEMU, why?
-@ cbnz tos, 1f
-@ loadtos
-@ b umslashmod
-@ 1:
-@ throw if divisor is zero
-    ldr r0, [psp, #0]
-    orrs r0, r0, tos
-    bne 2f
+    cbnz tos, 1f @ divisor high cell != 0: full 64/64 division
+    loadtos @ drop the zero high cell, tos = divisor low cell
+    cbnz tos, 2f @ throw if divisor is zero
     throw EDIVZ
 2:
+    bl umslashmod @ (rem quolo quohi)
+    ldr r0, [psp] @ save quolo
+    mov r1, #0 @ push 0 for remhi
+    str r1, [psp]
+    pushnos r0 @ (rem 0 quolo quohi)
+    NEXT
+1:
     bl ud_slash_mod
     NEXT
 
diff --git a/arm/words/muldiv.s b/arm/words/muldiv.s
index d081c3b..2722839 100644
--- a/arm/words/muldiv.s
+++ b/arm/words/muldiv.s
@@ -30,6 +30,11 @@ slashmod:
 CODEWORD "u/mod", USLASHMOD @ ( u1 u2 -- rem quot )
-    cbnz tos, uslashmod @ throw if divisor is zero
+    cbnz tos, 1f @ throw if divisor is zero
     throw EDIVZ
+1:
+    bl uslashmod @ must be a linked call: uslashmod returns with bx lr
+    NEXT
+
+@ call with bl uslashmod
 uslashmod:
     ldm psp!, {r0} @ Get u1 into a register
     movs r1, tos @ Back up the divisor in X.
@@ -38,4 +43,4 @@ uslashmod:
     subs r0, r1 @ Compute remainder.
     subs psp, #4
     str r0, [psp]
-    NEXT
+    bx lr
diff --git a/arm/words/um-slash-mod.s b/arm/words/um-slash-mod.s
index 611395e..0abf290 100644
--- a/arm/words/um-slash-mod.s
+++ b/arm/words/um-slash-mod.s
@@ -7,39 +7,56 @@ SHORT:
 Divide ud by u1, giving the quotient u3 and the remainder u2. All values and arithmetic are unsigned. An ambiguous condition exists if u1 is zero or if the quotient lies outside the range of a single-cell unsigned integer.
 */
 CODEWORD "um/mod", UMSLASHMOD
-    @ Same algorithm as if performing the division by hand, just in binary.
-    @ Inputs: hi:lo = 64-bit dividend, tos = 32-bit divisor
-    @ Outputs: hi:lo = 64-bit quotient, rem = 32-bit remainder
-    hi .req r0
-    lo .req r1
-    rem .req r2
-    idx .req r3
-umslashmod:
-    cbnz tos, 4f @ throw if divisor is zero
+    cbnz tos, 1f @ throw if divisor is zero
     throw EDIVZ
-4:  popnos hi
-    cmp hi, #0 @ if hi == 0, use the quicker u/mod
-    beq uslashmod
-    popnos lo
+1:
+    bl umslashmod @ ( rem quolo quohi )
+    cbz tos, 2f @ if quohi > 0, then quotient is too large
+    throw ERANGE
+2:
+    loadtos @ drop quohi
+NEXT
+
+@ call with bl umslashmod
+@ Same algorithm as if performing the division by hand, just in binary.
+@ Inputs: hi:lo = 64-bit dividend, tos = 32-bit divisor
+@ Outputs: hi:lo = 64-bit quotient, rem = 32-bit remainder
+@ Scratch: r0-r3 (hi, lo, rem, dsr) are used and not preserved
+hi .req r0 @ dividend-high
+lo .req r1 @ dividend-low
+rem .req r2 @ remainder
+dsr .req r3 @ divisor
+umslashmod: @ ( dndlo dndhi dsr -- rem quolo quohi )
+    popnos hi @ load dividend-high
+    cbnz hi, 3f @ if hi == 0, use the quicker u/mod
+    push {lr} @ nested call below overwrites lr
+    bl uslashmod @ ( rem quolo )
+    savetos
+    mov tos, #0 @ ( rem quolo 0 )
+    pop {pc} @ return
+3:
+    popnos lo @ load dividend-low
     @ TODO: could use CLZ to skip shifting through 0 bit prefix bit by bit and save some iterations
-    mov idx, #64 @ Loop counter for 64 bits
+    mov dsr, tos @ load divisor
+    mov tos, #64 @ Loop counter for 64 bits
     mov rem, #0 @ Initialize remainder to 0
 1:  lsls lo, lo, #1 @ Shift dividend/quotient low word
     adcs hi, hi, hi @ Shift dividend/quotient high word into carry (adc is how to lsl by 1 bit with carry)
     adcs rem, rem, rem @ Shift carry into remainder, also catch the bit shifting off at the top
-    bcs 5f @ If Carry is set, r3 is now effectively 33-bits, force the divisor subtraction
-    cmp rem, tos @ Can we subtract the divisor?
+    bcs 5f @ If Carry is set, rem is now effectively 33-bits, force the divisor subtraction
+    cmp rem, dsr @ Can we subtract the divisor?
     blo 2f @ If remainder < divisor, skip
-5:  sub rem, rem, tos @ remainder -= divisor
+5:  sub rem, rem, dsr @ remainder -= divisor
     adds lo, lo, #1 @ Set the lowest bit of quotient
-2:  subs idx, idx, #1 @ Decrement loop counter
+2:  subs tos, tos, #1 @ Decrement loop counter
     bne 1b
 
-    cbz hi, 3f @ if hi > 0, then quotient is too large
-    throw ERANGE
-3:  pushnos rem
-    mov tos, lo
-    .unreq rem
-    .unreq lo
-    .unreq hi
-NEXT
+    pushnos rem
+    pushnos lo
+    mov tos, hi @ ( rem quolo quohi )
+    bx lr
+.unreq rem
+.unreq lo
+.unreq hi
+.unreq dsr
+
diff --git a/tests/core2.fr b/tests/core2.fr
index 99b3946..498b0e7 100644
--- a/tests/core2.fr
+++ b/tests/core2.fr
@@ -3,3 +3,13 @@ TESTING rot, -rot
 T{ 1 2 3 -rot -> 3 1 2 }T
 T{ 1 2 3 -rot rot -> 1 2 3 }T
 
+TESTING doubles
+
+\ full 64/64 division
+T{ $10000000010. $100000000. ud/mod -> $10. $100. }T
+\ escape to 64/32 division
+T{ $100000010. $10000. ud/mod -> $10. $10000. }T
+\ escape to 32/32 division
+T{ $1000010. $1000. ud/mod -> $10. $1000. }T
+\ u/mod itself must still return to the inner interpreter
+T{ 7 2 u/mod -> 1 3 }T
diff --git a/todo.md b/todo.md
index 8bc7b25..f67d97c 100644
--- a/todo.md
+++ b/todo.md
@@ -33,7 +33,8 @@ List of know issues and tasks that need to be done (by area)
 * [x] implement m-rot.s (see rv)
 * [x] implement umstar.s (see rv)
 * [ ] (exiti) likely needs work
-* [ ] document dev tool setup
+* [ ] we are not using the link register, would it speed things up if it was used to cache DO_NEXT?
+      (i.e. macro NEXT would do `bx lr` instead of `b DO_NEXT`)
 
 ## LM4F120
 
@@ -47,9 +48,7 @@ List of know issues and tasks that need to be done (by area)
 
 # RISC-V
 
-* [ ] add readme.md
-* [ ] generalize flash dictionary write support (flash.s)
-* [ ] generalize eeprom support (eeprom.s)
+* [ ] implement native um/mod and optimize for narrow arguments (see ARM)
 
 ## CH32V307
 * [ ] RAMALLOT reg_shadow differences between 307 ad QEM configuration