; kate:  indent-width 4; tab-width 4

;************************************************************************
;*																		*
;* 		I N T   M A T H   M O D U L E									*
;*		32-bit Signed Integer Arithmetic Functions						*
;*																		*
;*		COPYRIGHT AND WARRANTY NOTE										*
;*		This software is copyright (C) 2023, Mark A. Haidekker			*
;*		Software may be used and distributed under the 					*
;*		GNU GPL v 3.0 or later											*
;*		No warranty - use entirely at your own risk						*
;*																		*
;*		See https://www.gnu.org/licenses/gpl-3.0.en.html				*
;*																		*
;************************************************************************

; NOTE --- in the following section, we decide which processor include
;		file to load. We can use gpasm's processor symbol, such as
;		__18F8520, which is defined through the -p command line option.

	IFDEF __18F8520
		messg "PIC18f8520 selected"
		TITLE "INTMATH_S FOR 18F8520"
		#include p18f8520.inc
	ENDIF

	IFDEF __18F4520
		messg "PIC18f4520 selected"
		TITLE "INTMATH_S FOR 18F4520"
		#include p18f4520.inc
	ENDIF

	IFDEF __18F13K50
		messg "PIC18f13k50 selected"
		TITLE "INTMATH_S FOR 18F13k50"
		#include p18f13k50.inc
	ENDIF

	IFDEF __18F2520
		messg "PIC18f2520 selected"
		TITLE "INTMATH_S FOR 18F2520"
		#include p18f2520.inc
	ENDIF


;	Clock: Any


; this module REQUIRES

;	nothing

; this module PROVIDES

	GLOBAL	xreg0,xreg1,xreg2,xreg3,xreg4
	GLOBAL	yreg0,yreg1,yreg2,yreg3
	GLOBAL	zreg0,zreg1,zreg2,zreg3
	GLOBAL	treg0,treg1,treg2,treg3
	GLOBAL	dreg0,dreg1,dreg2,dreg3

	GLOBAL	init_dreg			; Set decade register to 1
	GLOBAL	clear_xreg			; Zero X register
	GLOBAL	clear_x_to_W		; Set xreg to value of W (0...255, unsigned)
	GLOBAL	copyxy				; Copy Xreg into Yreg
	GLOBAL	copyxz				; Copy Xreg into Zreg
	GLOBAL	copyzx				; Copy zreg into Xreg
	GLOBAL	copydx				; Copy Dreg into Xreg
	GLOBAL	copyxt				; Copy Xreg into Treg
	GLOBAL	copyxd				; Copy Xreg into Dreg
	GLOBAL	swapxy				; Exchange X and Y
	GLOBAL	abs_x				; Compute absolute of xreg ( abs(x) -> x )
	GLOBAL	chs_x				; Change sign of xreg ( -x -> x )
	GLOBAL	clamp_x				; Clamp negative values of X so that always X >= 0
	GLOBAL	expand_sign			; Expand sign of 16-bit X to 32 bits
	GLOBAL	xtest0				; Test if x==0, Z-flag set if so
	GLOBAL	xplusy				; Add yreg to xreg:	y + x -> x
	GLOBAL	xplusw				; add WREG to xreg: x + W -> x (all other registers unchanged)
	GLOBAL	xplusplus			; Increment xreg by 1
	GLOBAL	xplusz				; Add zreg to xreg:	z + x -> x
	GLOBAL	tplusx				; Add xreg to treg:	t + x -> t
	GLOBAL	yminusx				; subtract xreg from yreg: y - x -> x
	GLOBAL	xequalsy			; Test x==y, Z if equals
	GLOBAL	xequalsy16			; Workaround, since we never seem to compare anything > 16 bits
	GLOBAL	cmpxy				; Test if xreg >= yreg through dummy x-y. C=1 if yes
	GLOBAL	scmpxy				; Same as above, but for signed ints
	GLOBAL	cmpyz				; Test if yreg >= zreg through dummy y-z. C=1 if yes
	GLOBAL	lshiftx				; Left shift XREG (both signed and unsigned)
	GLOBAL	rshiftx				; Right shift xreg (signed)
	GLOBAL	xtimesw				; Multiply Xreg times W (fast hardware mul)
	GLOBAL	sxtimesw			; Multiply signed Xreg times W (fast hardware mul)
	GLOBAL	xtimes10			; Multiply Xreg times 10 (fast hardware mul)
	GLOBAL	mulxy				; Multiplication x*y -> x
	GLOBAL	smulxy				; Signed Multiplication x*y -> x
	GLOBAL	subyz				; Subtract Zreg from Yreg: y=y-z
	GLOBAL	divyx				; Division y/x -> x
	GLOBAL	sdivyx				; Signed Division y/x -> x
	GLOBAL	intsqrt				; Unsigned sqrt (xreg) -> xreg
	GLOBAL	x_to_bcd			; Convert xreg into BCD
	GLOBAL	atoi				; Convert a string at [FSR0] to a number in xreg
	GLOBAL	atoi_hex			; Convert a hex string into a value in xreg


;***************************************************


variables	UDATA
;
; A set of 32-bit registers for integer arithmetic.
; Byte 0 of each register is the least significant byte: the add and
; subtract routines start at byte 0 and propagate carry/borrow upward.
;
xreg0		RES 1
xreg1		RES 1
xreg2		RES 1
xreg3		RES 1
xreg4		RES 1		; Overflow byte of xtimesw/atoi (nonzero = overflow);
						; divyx borrows it as its loop counter

yreg0		RES 1
yreg1		RES 1
yreg2		RES 1
yreg3		RES 1

zreg0		RES 1
zreg1		RES 1
zreg2		RES 1
zreg3		RES 1

treg0		RES 1
treg1		RES 1
treg2		RES 1
treg3		RES 1

dreg0		RES 1		; dreg0..3 are preset to 1 by init_dreg
dreg1		RES 1
dreg2		RES 1
dreg3		RES 1
dreg		RES 1			; decade (not mul'd by 10); not referenced by the code in this module

temp1		RES	1		; Scratch: multiplier in xtimesw, bit counter in x_to_bcd
temp2		RES	1		; Scratch: byte counter in x_to_bcd
sign		RES	1		; Sign flag for smulxy/sdivyx/atoi, scratch byte in atoi_hex



INTMATH		CODE


;*********************************************************************
;
; Integer arithmetic stuff


init_dreg			; Preset the decade register: dreg = 1
		clrf	dreg3		; Upper three bytes are zero ...
		clrf	dreg2
		clrf	dreg1
		movlw	.1			; ... and the least significant byte is 1
		movwf	dreg0		; Leaves W = 1 on exit
		return

clear_xreg			; x = 0, including the overflow byte xreg4. W is untouched.
		clrf	xreg4
		clrf	xreg3
		clrf	xreg2
		clrf	xreg1
		clrf	xreg0
		return

clear_x_to_W			; x = W, zero-extended (0...255); W is preserved
		clrf	xreg4		; Clear the overflow byte and the upper bytes first
		clrf	xreg3
		clrf	xreg2
		clrf	xreg1
		movwf	xreg0		; then drop W into the least significant byte
		return


copyxy					; y = x (4 bytes; xreg4 is not copied). W and flags untouched.
		movff	xreg3,yreg3
		movff	xreg2,yreg2
		movff	xreg1,yreg1
		movff	xreg0,yreg0
		return


copyxz					; z = x (4 bytes). W and flags untouched.
		movff	xreg3,zreg3
		movff	xreg2,zreg2
		movff	xreg1,zreg1
		movff	xreg0,zreg0
		return


copyzx					; x = z (4 bytes; xreg4 is left as it was). W and flags untouched.
		movff	zreg3,xreg3
		movff	zreg2,xreg2
		movff	zreg1,xreg1
		movff	zreg0,xreg0
		return


copydx					; x = d (4 bytes; xreg4 is left as it was). W and flags untouched.
		movff	dreg3,xreg3
		movff	dreg2,xreg2
		movff	dreg1,xreg1
		movff	dreg0,xreg0
		return


copyxd					; d = x (4 bytes). W and flags untouched.
		movff	xreg3,dreg3
		movff	xreg2,dreg2
		movff	xreg1,dreg1
		movff	xreg0,dreg0
		return


copyxt					; t = x (4 bytes). W and flags untouched.
		movff	xreg3,treg3
		movff	xreg2,treg2
		movff	xreg1,treg1
		movff	xreg0,treg0
		return


; swapxy -- exchange the 32-bit contents of xreg and yreg, one byte at a
; time, using W as the temporary. xreg4 is not involved.
; On exit W holds the former xreg3, and Z/N reflect that byte.
swapxy					; Exchange X and Y
		movf	xreg0,W		; W = old x0
		movff	yreg0,xreg0	; x0 = y0 (MOVFF leaves W and flags alone)
		movwf	yreg0		; y0 = old x0
		movf	xreg1,W		; Same pattern for bytes 1..3
		movff	yreg1,xreg1
		movwf	yreg1
		movf	xreg2,W
		movff	yreg2,xreg2
		movwf	yreg2
		movf	xreg3,W
		movff	yreg3,xreg3
		movwf	yreg3
		return

abs_x					; x = |x|: negate only if the sign bit is set
		btfss	xreg3,7		; Sign bit clear?
		return				; Yes: already non-negative, leave x alone
							; No: fall through into chs_x

chs_x					; x = -x (two's complement of the 32-bit value)
		comf	xreg3, F		; Invert all four bytes (one's complement)
		comf	xreg2, F
		comf	xreg1, F
		comf	xreg0, F
		movlw	0x00			; Then add 1 by adding 0 with the carry set
		bsf		STATUS,C
		addwfc	xreg0, F
		addwfc	xreg1, F		; Carry ripples through the upper bytes
		addwfc	xreg2, F
		addwfc	xreg3, F
		return					; W = 0 on exit; xreg4 is not touched

clamp_x						; x = max(x, 0): negative values become zero
		btfsc	xreg3,7		; Sign bit set?
		bra		clear_xreg	; Yes: zero x (clear_xreg returns to our caller)
		return				; No: x is already >= 0


expand_sign					; Sign-extend the 16-bit value in xreg1:xreg0 to 32 bits
		clrf	xreg2		; Assume non-negative: upper word = 0x0000
		btfsc	xreg1,7		; Is the 16-bit sign bit set?
		setf	xreg2		; Yes: extension byte becomes 0xFF
		movff	xreg2,xreg3	; Both upper bytes carry the same extension byte
		return
	

xtest0					; Z = 1 if x == 0 (bytes 0..3; xreg4 is not tested)
		movf	xreg3,W		; OR all four bytes together in W;
		iorwf	xreg2,W		; the result is zero only if every byte is zero
		iorwf	xreg1,W
		iorwf	xreg0,W		; Z reflects the combined OR. C is not affected.
		return

; xplusplus -- x = x + 1.  Loads W with 1 and falls through into xplusw.
; xplusw    -- x = x + W (W taken as unsigned 0...255).
; Both leave W = 0 on exit; C holds the carry out of xreg3. xreg4 is not updated.
xplusplus				; Increment xreg by 1
		movlw	0x01
xplusw					; Add W to xreg:	x + W -> x
		addwf	xreg0, F
		movlw	0x00			; MOVLW does not alter C, so the carry survives
		addwfc	xreg1, F		; Propagate the carry through the upper bytes
		addwfc	xreg2, F
		addwfc	xreg3, F
		return

; xplusy -- 32-bit addition x = x + y. yreg is not modified.
; On exit C is the carry out of byte 3 and W holds yreg3. xreg4 is not updated.
xplusy					; Add yreg to xreg:	y + x -> x
		movf	yreg0,W
		addwf	xreg0,F		; Lowest byte: plain add
		movf	yreg1,W		; MOVF changes only Z/N, the carry is preserved
		addwfc	xreg1,F		; Remaining bytes: add with carry
		movf	yreg2,W
		addwfc	xreg2,F
		movf	yreg3,W
		addwfc	xreg3,F
		return

; xplusz -- 32-bit addition x = x + z. zreg is not modified.
; On exit C is the carry out of byte 3 and W holds zreg3. xreg4 is not updated.
xplusz					; Add zreg to xreg: z + x -> x
		movf	zreg0,W
		addwf	xreg0,F		; Lowest byte: plain add
		movf	zreg1,W
		addwfc	xreg1,F		; Remaining bytes: add with carry
		movf	zreg2,W
		addwfc	xreg2,F
		movf	zreg3,W
		addwfc	xreg3,F
		return

; tplusx -- 32-bit accumulate t = t + x. xreg is not modified.
; On exit C is the carry out of byte 3 and W holds xreg3.
tplusx					; Accumulate in treg, t + x -> t
		movf	xreg0,W
		addwf	treg0,F		; Lowest byte: plain add
		movf	xreg1,W
		addwfc	treg1,F		; Remaining bytes: add with carry
		movf	xreg2,W
		addwfc	treg2,F
		movf	xreg3,W
		addwfc	treg3,F
		return

; yminusx -- 32-bit subtraction x = y - x. yreg is not modified.
; On exit C = 1 means no borrow (y >= x as unsigned), C = 0 means a borrow.
yminusx					; subtract xreg from yreg: y - x -> x
		movf	xreg0,W
		subwf	yreg0,W		; W = y0 - x0
		movwf	xreg0		; MOVWF leaves the borrow in C intact
		movf	xreg1,W
		subwfb	yreg1,W		; W = y1 - x1 - borrow
		movwf	xreg1
		movf	xreg2,W
		subwfb	yreg2,W
		movwf	xreg2
		movf	xreg3,W
		subwfb	yreg3,W
		movwf	xreg3
		return

; xequalsy16 / cmpxy16 -- unsigned 16-bit compare of xreg1:xreg0 against
; yreg1:yreg0 (bytes 2 and 3 are ignored).
;	Out:	C = 1 if x >= y, C = 0 if x < y
;			Z = 1 if x == y, Z = 0 otherwise
;	Clobbers W; xreg and yreg are not modified.
;
; The Z flag left by the final SUBWFB describes the high byte only, so a
; pair such as x=0x0100, y=0x0001 (0x01 - 0x00 - borrow = 0) would look
; "equal". Z is therefore re-derived from the low bytes whenever the high
; difference is zero.

xequalsy16				; Equality result in Z (carry result in C as for cmpxy16)

cmpxy16					; Temporary workaround
		movf	yreg0,W
		subwf	xreg0,W			; Low byte x0 - y0; only the borrow matters here
		movf	yreg1,W
		subwfb	xreg1,W			; High byte with borrow: C = 1 if x >= y
		bnz		cmpxy16_done	; High difference nonzero: x != y and Z = 0 already
		movf	yreg0,W			; High difference is zero, so x == y exactly when
		xorwf	xreg0,W			; the low bytes match. XORWF/MOVF leave C untouched.
cmpxy16_done
		return

; xequalsy -- 32-bit equality test. Z = 1 if x == y, Z = 0 otherwise.
; Bytes are XORed pairwise; the first mismatch leaves Z = 0 and exits,
; and if all four pairs match the last XOR leaves Z = 1.
; W holds the last XOR result on exit; C is not affected.
xequalsy
		movf	xreg0, W
		xorwf	yreg0, W		; Zero result means the bytes are identical
		bnz		xequalsy_done	; Mismatch: Z is already 0
		movf	xreg1, W
		xorwf	yreg1, W
		bnz		xequalsy_done
		movf	xreg2, W
		xorwf	yreg2, W
		bnz		xequalsy_done
		movf	xreg3, W
		xorwf	yreg3, W		; Z now reports the final byte pair
xequalsy_done
		return


; cmpxy -- unsigned 32-bit compare through a dummy subtraction x - y.
;	Out:	C = 1 if x >= y, C = 0 if x < y
;	Neither xreg nor yreg is modified; W is clobbered.
; Z is NOT an equality indicator: it only reflects the result of the last
; byte subtraction (xreg3 - yreg3 - borrow). Use xequalsy to test x == y.
cmpxy					; Test if xreg >= yreg through dummy x-y, return C=1 if yes
		movf	yreg0,W
		subwf	xreg0,W
		movf	yreg1,W
		subwfb	xreg1,W
		movf	yreg2,W
		subwfb	xreg2,W		; WARNING: for values of 16 bits or less the upper bytes are zero,
		movf	yreg3,W		; so the last step is 0 - 0 - borrow: Z=1 without a borrow, Z=0 with one
		subwfb	xreg3,W
		return

; For that dratted equals case, do it the other way 'round:

; cmpyx -- unsigned 32-bit compare through a dummy subtraction y - x,
; the mirror image of cmpxy.
;	Out:	C = 1 if y >= x, C = 0 if y < x
;	Neither register is modified; W is clobbered. Not exported (no GLOBAL).
cmpyx						; Test if yreg >= xreg through dummy y-x, return C=1 if yes
		movf	xreg0,W		; Implicitly, yreg < xreg? C=	0 if yes.
		subwf	yreg0,W
		movf	xreg1,W
		subwfb	yreg1,W		; Borrow propagates through the upper bytes
		movf	xreg2,W
		subwfb	yreg2,W
		movf	xreg3,W
		subwfb	yreg3,W
		return


; scmpxy -- signed 32-bit compare, the signed counterpart of cmpxy.
;	Out:	C = 1 if x >= y, C = 0 if x < y (two's complement values)
;	Neither xreg nor yreg is modified; W is clobbered.
;
; Equal signs (both non-negative or both negative): two's complement
; numbers with the same sign bit sort exactly like unsigned numbers
; (e.g. -1 = 0xFFFFFFFF > -2 = 0xFFFFFFFE), so cmpxy gives the answer
; directly. The operands must NOT be swapped for the negative case.
; Different signs: the non-negative operand is always the larger one.

scmpxy
		movf	xreg3,W
		xorwf	yreg3,W		; Bit 7 of W = 1 if the signs differ
		btfss	WREG,7		; Signs differ?
		bra		cmpxy		; No: unsigned compare is valid, it returns for us
		bsf		STATUS,C	; Signs differ: assume x is the non-negative one (x > y)
		btfsc	xreg3,7		; Is x actually the negative one?
		bcf		STATUS,C	; Yes: x < y
		return


; Left- and right shifts. Often used for multiplication and division
; with/by 2. Left shift is the same in signed and unsigned versions
; Both functions leave W intact.


; xtimes16 -- x = x * 16: three calls to lshiftx, then falls through into
;             lshiftx for the fourth shift. Not exported (no GLOBAL).
; lshiftx  -- x = x << 1 with a zero shifted into bit 0.
; Neither routine touches W. Only xreg0..3 take part: the bit shifted out
; of xreg3 ends up in C and is not stored in xreg4.
xtimes16						; Multiply xreg by 16 by performing 4 left shifts. 
		rcall	lshiftx			; WREG must remain unaffected!
		rcall	lshiftx
		rcall	lshiftx			; For the 4th shift, drop into lshiftx

lshiftx
		bcf		STATUS, C		; Shift in a zero
		rlcf	xreg0, F
		rlcf	xreg1, F		; Each byte passes its top bit on through C
		rlcf	xreg2, F
		rlcf	xreg3, F
		return

rshiftx					; Arithmetic (sign-preserving) right shift: x = x >> 1. W is untouched.
		bsf		STATUS, C	; Preload C with the sign: assume negative ...
		btfss	xreg3, 7	; ... and if the sign bit is clear,
		bcf		STATUS, C	; shift in a zero instead
		rrcf	xreg3, F	; The sign copy enters bit 31,
		rrcf	xreg2, F	; every byte hands its bit 0 down through C
		rrcf	xreg1, F
		rrcf	xreg0, F	; C ends up holding the bit shifted out of x
		return



; Multiply X by W. Since this is
; an asymmetric 32 bit by 8 bit operation, we may
; take advantage of the hardware multiplier
; Note: Destroys temp1 and yreg

; xtimes10 -- x = x * 10. Loads W with 10 and falls through into xtimesw.
; xtimesw  -- x = x * W (unsigned 32 x 8 bit) using the hardware multiplier.
; The 40-bit product lands in xreg4:xreg0, so xreg4 != 0 signals overflow
; of the 32-bit result. Destroys temp1 and yreg (y receives the old x).

xtimes10
		movlw	0x0a

xtimesw
		movwf	temp1			; Keep the multiplier while W is reused

		call	copyxy			; y = multiplicand
		call	clear_xreg		; x (incl. xreg4) = 0, becomes the sum of partial products

		movf	temp1,W			; Partial product of byte 0 -> xreg1:xreg0
		mulwf	yreg0
		movff	PRODL,xreg0
		movff	PRODH,xreg1

		movf	temp1,W			; Partial product of byte 1, added at xreg2:xreg1
		mulwf	yreg1
		movf	PRODL,W
		addwf	xreg1,F
		movf	PRODH,W
		addwfc	xreg2,F

		movf	temp1,W			; Partial product of byte 2, added at xreg3:xreg2
		mulwf	yreg2
		movf	PRODL,W
		addwf	xreg2,F
		movf	PRODH,W
		addwfc	xreg3,F

		movf	temp1,W			; Partial product of byte 3, added at xreg4:xreg3
		mulwf	yreg3
		movf	PRODL,W
		addwf	xreg3,F
		movf	PRODH,W
		addwfc	xreg4,F			; Anything arriving here is overflow
		return


; sxtimesw -- x = x * W with x signed and W unsigned.
; A negative x is negated, multiplied as a positive number, and the
; product is negated again. Destroys temp1 and yreg (via xtimesw).
sxtimesw
		btfss	xreg3,7		; Is x negative?
		bra		xtimesw		; No: plain unsigned multiply, it returns for us
		movwf	temp1		; chs_x clobbers W, so park the multiplier
		call	chs_x		; x = |x|
		movf	temp1,W		; Multiplier back into W
		call	xtimesw		; Unsigned product
		goto	chs_x		; Negate the product; chs_x returns to our caller


;
; Multiplication x*y -> x
; Hardware 8x8 multiplication requires a lot of overhead
; for full 32-bit multiplication, so we use the ancient
; add/divide algorithm.
; Destroys zreg.
;

; mulxy -- unsigned 32-bit multiplication x = x * y (result truncated to
; 32 bits). The multiplier is moved to z and x accumulates the product;
; each pass keeps the invariant  product = x + y*z:
;	- z even: z = z/2, y = y*2 (value of y*z unchanged)
;	- then always: x = x + y, z = z - 1
; until z reaches zero. Destroys zreg and yreg.

mulxy
		call	copyxz			; z = multiplier
		call	clear_xreg		; x = 0, the product accumulator

mulxy_next
		movf	zreg3,W			; Is z == 0?
		iorwf	zreg2,W
		iorwf	zreg1,W
		iorwf	zreg0,W
		bz		mulxy_done		; Yes: x holds the product

		btfsc	zreg0,0			; z odd?
		bra		mulxy_accum		; Yes: skip the halving step

		bcf		STATUS,C		; z even: z = z/2 ...
		rrcf	zreg3, F
		rrcf	zreg2, F
		rrcf	zreg1, F
		rrcf	zreg0, F
		bcf		STATUS,C		; ... and y = y*2
		rlcf	yreg0, F
		rlcf	yreg1, F
		rlcf	yreg2, F
		rlcf	yreg3, F

mulxy_accum
		call	xplusy			; x = x + y
		movlw	0x01			; z = z - 1, with the borrow carried upward
		subwf	zreg0, F
		movlw	0x00
		subwfb	zreg1, F
		subwfb	zreg2, F
		subwfb	zreg3, F
		bra		mulxy_next

mulxy_done
		return


; For a signed multiplication, we need to make both multiplicands
; positive, then we may use unsigned mul.

; smulxy -- signed 32-bit multiplication x = x * y.
; The result sign (XOR of the operand signs) is parked in sign<0>, both
; operands are made non-negative, mulxy does the work and the sign is
; applied at the end. Destroys yreg, zreg and sign.
smulxy
		movf	xreg3,W		; Bit 7 of (x3 XOR y3) is set when the signs differ
		xorwf	yreg3,W
		clrf	sign
		btfsc	WREG,7
		bsf		sign,0		; sign<0> = 1: the product must be negative
		call	abs_x		; First operand -> magnitude
		call	swapxy		; Bring the other operand into x (x*y = y*x)
		call	abs_x		; Second operand -> magnitude
		call	mulxy		; Unsigned product of the magnitudes
		btfss	sign,0		; Negative result wanted?
		return				; No: done
		goto	chs_x		; Yes: negate; chs_x returns to our caller


; Division y/x -> x
; with the remainder in y after the operation
; This is the classical shift'n'subtract approach,
; equivalent to the multiplication above.
; Since this algorithm cannot overflow, we use xreg4 as counter
;
; Here is how it works:
;
;	1.	Test x==0.		If so, exit with error (C=1)
;	2. 	z = x			Copy x into z -- we use x as work register
;		t = 1			Counter (we use xreg4 for this)
;	3. 	while (y >= z)		This while loop aligns the highest bits of y and z
;	4. 		z = z << 1		Multiply z by 2
;			t = t+1			Increment counter
;		endwhile
;	5. 	x = x<<1
;	6. 	If y >= z 
;			y = y-z
;			x = x+1
;		endif
;	7. 	z = z >> 1
;	8.	t = t-1
;	9.	if t>0 goto 5
;


; cmpyz -- unsigned 32-bit compare through a dummy subtraction y - z.
;	Out:	C = 1 if y >= z, C = 0 if y < z
;	Neither register is modified; W is clobbered.
;	Z only reflects the last byte subtraction and is not an equality flag.
cmpyz					; Test if yreg >= zreg through dummy y-z
		movf	zreg0,W		; Return C=1 if yes
		subwf	yreg0,W
		movf	zreg1,W
		subwfb	yreg1,W		; Borrow propagates through the upper bytes
		movf	zreg2,W
		subwfb	yreg2,W
		movf	zreg3,W
		subwfb	yreg3,W
		return

; subyz -- 32-bit subtraction y = y - z. zreg is not modified.
; On exit C = 1 means no borrow (y was >= z), C = 0 means a borrow.
subyz					; Subtract Zreg from Yreg: y=y-z
		movf	zreg0,W
		subwf	yreg0,F		; Lowest byte: plain subtract
		movf	zreg1,W
		subwfb	yreg1,F		; Remaining bytes: subtract with borrow
		movf	zreg2,W
		subwfb	yreg2,F
		movf	zreg3,W
		subwfb	yreg3,F
		return


; divyx -- unsigned 32-bit division (shift and subtract).
;	In:		yreg = dividend, xreg = divisor
;	Out:	xreg = quotient, yreg = remainder, C = 0
;	Divisor zero: xreg = 0, yreg = 0, C = 1
;	Destroys zreg and W; xreg4 serves as loop counter and is 0 on exit.
;
; Phase 1 (divyx_shift) shifts the divisor copy in z left until it is
; larger than y, counting the steps. The shifting also stops as soon as
; bit 31 of z is set: one more shift would push that bit out, z would
; become small again and large dividends (e.g. 0xFFFFFFFF / 1) could
; never be aligned. Stopping there is sufficient because y < 2^32 means
; the quotient bit at that position can only be 0 or 1.
; Phase 2 (divyx_main) produces one quotient bit per counted position.

divyx
		call	xtest0			; Test for division by zero
		bnz		divyx_prep		; This is really needed, because the
		goto	divby0			; alignment loop needs a nonzero divisor
divyx_prep
		call	copyxz			; z = divisor, x becomes the quotient
		call	clear_xreg		; Also sets xreg4=0
		incf	xreg4, F		; Counter = number of shifts + 1

divyx_shift
		call	cmpyz			; Test yreg >= zreg
		bnc		divyx_main		; Stop shifting if yreg < zreg
		btfsc	zreg3,7			; Top bit of z already set?
		bra		divyx_main		; Yes: z cannot be shifted any further
		incf	xreg4, F		; Count this shift (at most 31 in total)
		bcf		STATUS,C
		rlcf	zreg0, F
		rlcf	zreg1, F		; z = z*2
		rlcf	zreg2, F
		rlcf	zreg3, F
		bra		divyx_shift

divyx_main
		bcf		xreg4,7			; Counter is <= 32; bit 7 is free as "pseudo-carry"
		call	cmpyz			; Test yreg >= zreg
		bnc		divyx_01
		bsf		xreg4,7			; Remember: this quotient bit is 1
		call	subyz			; If yreg >= zreg do y = y-z
divyx_01
		bcf		STATUS,C		; Move the remembered quotient bit into C ...
		btfsc	xreg4,7
		bsf		STATUS,C
		rlcf	xreg0, F		; ... and shift it into the quotient:
		rlcf	xreg1, F		; x = x*2 + bit
		rlcf	xreg2, F
		rlcf	xreg3, F

		bcf		xreg4,7			; Restore the plain counter value
		bcf		STATUS,C
		rrcf	zreg3, F
		rrcf	zreg2, F		; and finally do z = z/2
		rrcf	zreg1, F
		rrcf	zreg0, F

		decfsz	xreg4, F		; loop until counter reaches zero
		bra		divyx_main
		bcf		STATUS,C		; C = 0: valid result
		return


divby0
		call	clear_xreg		; Uck. n/0 = 0, bad assumption. Need to correct.
		bsf		STATUS,C		; C = 1 flags the error (MOVFF in copyxy keeps it)
		goto	copyxy			; Remainder = 0 as well; copyxy returns to the caller


; For a signed division, we use the same trick as for
; multiplication: We isolate the sign beforehand.
; Note that there are different definitions, e.g., Euclidean division,
; under which the remainder *has* to be positive. Here, we simply
; assign the same sign to quotient and remainder.

; sdivyx -- signed 32-bit division: x = y / x, remainder in y.
; The sign of the result (XOR of the operand signs) is parked in sign<0>,
; both operands are made non-negative and divyx does the work. When the
; signs differ, quotient AND remainder are both negated.
; Destroys zreg and sign.
sdivyx
		movf	xreg3,W		; Bit 7 of (x3 XOR y3) is set when the signs differ
		xorwf	yreg3,W
		clrf	sign
		btfsc	WREG,7
		bsf		sign,0		; sign<0> = 1: the result must be negative
		call	abs_x		; Divisor -> magnitude
		call	swapxy
		call	abs_x		; Dividend -> magnitude
		call	swapxy		; Back in place: y/x is not x/y
		call	divyx		; Unsigned division of the magnitudes
		btfss	sign,0		; Negative result wanted?
		return				; No: done
		call	swapxy		; Yes: bring the remainder into x,
		call	chs_x		; negate it,
		call	swapxy		; put quotient and remainder back in place
		goto	chs_x		; and negate the quotient; chs_x returns to our caller




;---------------------------------------------------------
; Integer square root. This algorithm is based on
; Jack W. Crenshaw's algorithm, which is well documented
; on the web. It goes back to Newton-Raphson's method in which
; the task x = sqrt(a) leads to the roots of x^2 - a = 0 
; and the iterative approach x[n+1] = x[n]/2 + a/2x[n]
;
; Here, the argument is passed in xreg and the result
; returned in xreg. zreg is used to form the result,
; and treg is the bit shifter. dreg remains untouched.

; intsqrt -- unsigned integer square root, xreg = round(sqrt(xreg)).
; Register usage:
;	xreg	argument; reduced step by step to the remainder, finally the result
;	treg	the "bit shifter": a power of four that is divided by 4 per iteration
;	zreg	result accumulator
;	yreg	scratch (trial values and shifted copies) -- destroyed
intsqrt:
		call	xtest0			; To be on the safe side, catch the case of x=0
		bnz		intsq_notzero	; and exit with sqrt() = 0
		return					; This is probably worth a few cycles

intsq_notzero
		clrf	treg0			; treg is the power 2^(2k)
		clrf	treg1
		clrf	treg2
		movlw	0x40			; Start value 0x40000000 = 4^15
		movwf	treg3
		clrf	zreg0			; zreg is used to accumulate the result
		clrf	zreg1			; To save a few cycles, we'll make use of
		clrf	zreg2			; the fact that the result *must* be a
		clrf	zreg3			; 16-bit value

		; We reduce treg in steps of a factor of four for as long as
		; treg/4 is still >= the argument. The search stops with
		; treg/4 < argument, so treg itself may still exceed the
		; argument; that only costs one extra pass through the loop below.

intsq_pw4find
		bcf		STATUS, C		; begin by a test divide-by-4
		rrcf	treg3, W		; by setting yreg = treg >> 2
		movwf	yreg3			; leave the argument in xreg untouched
		rrcf	treg2, W
		movwf	yreg2
		rrcf	treg1, W
		movwf	yreg1
		rrcf	treg0, W
		movwf	yreg0			; yreg = treg >> 1 at this point
		bcf		STATUS, C
		rrcf	yreg3, F
		rrcf	yreg2, F
		rrcf	yreg1, F
		rrcf	yreg0, F		; yreg = treg >> 2

		call	cmpyx			; Test if yreg >= xreg, C=1 if yes ... meaning bitsft/4 >= arg
		bnc		intsq_loop		; Test-shifted value is < arg, treg is ready

		movff	yreg0, treg0	; Test-shifted value was still >= arg,
		movff	yreg1, treg1	; secure this value and try next lower power of 4
		movff	yreg2, treg2
		movff	yreg3, treg3
		bra		intsq_pw4find

		; The iterative loop: Repeat the loop while treg is not zero
		; Up to this point, the argument in xreg has not been touched

intsq_loop
		movf	treg0, W		; Test treg==0
		iorwf	treg1, W
		iorwf	treg2, W
		iorwf	treg3, W
		bz		intsq_done		; treg is zero -> all iterations completed

		; Next, we need bitsft+result (treg+zreg) in yreg
		; (note, result is maximally 2 bytes, but we don't save anything)

		movf	treg0, W
		addwf	zreg0, W
		movwf	yreg0
		movf	treg1, W
		addwfc	zreg1, W
		movwf	yreg1
		movf	treg2, W
		addwfc	zreg2, W
		movwf	yreg2
		movf	treg3, W
		addwfc	zreg3, W
		movwf	yreg3

		call	cmpxy			; test if xreg >= (bitsft+result), C=1 if yes
		bnc		intsq_shiftit	; If not, skip the addition section

		; If xreg >= (bitsft+result), we need to perform
		;		xreg = xreg - (bitsft+result) ... subtract Y from X
		;		zreg = zreg + (bitsft<<1)

		movf	yreg0, W		; X = X - Y, this is
		subwf	xreg0, F		; the opposite of yminusx
		movf	yreg1, W
		subwfb	xreg1, F		; yields xreg - (bitsft+result)
		movf	yreg2, W
		subwfb	xreg2, F
		movf	yreg3, W
		subwfb	xreg3, F		; At this point, yreg is free

		bcf		STATUS, C		; bitsft << 1 needs to be a separate
		rlcf	treg0, W		; operation, because we need the C flag
		movwf	yreg0
		rlcf	treg1, W
		movwf	yreg1
		rlcf	treg2, W
		movwf	yreg2
		rlcf	treg3, W
		movwf	yreg3			; We now have (bitsft << 1) in yreg

		movf	yreg0, W
		addwf	zreg0, F		; and add this to the result accumulator
		movf	yreg1, W		; in zreg
		addwfc	zreg1, F
		movf	yreg2, W		; Done here (16-bit), but do it anyway
		addwfc	zreg2, F		; for consistency
		movf	yreg3, W
		addwfc	zreg3, F

		; Unconditional (every iteration):
		;		result = result >> 1
		;		bitsft = bitsft >> 2

intsq_shiftit
		bcf		STATUS, C
		rrcf	zreg3, F		; Divide result by 2
		rrcf	zreg2, F
		rrcf	zreg1, F
		rrcf	zreg0, F

		bcf		STATUS, C
		rrcf	treg3, F		; Divide bitsft by 4
		rrcf	treg2, F
		rrcf	treg1, F
		rrcf	treg0, F
		bcf		STATUS, C
		rrcf	treg3, F
		rrcf	treg2, F
		rrcf	treg1, F
		rrcf	treg0, F

		bra		intsq_loop		; proceed to next iteration

		; We are done with iterations. The truncated result is in zreg
		; and xreg holds the remainder (argument - result^2).
		; A simple trick gives the rounded instead of the truncated
		; value: increment the result by one if result < remainder.

intsq_done
		call	copyxy				; Stow the remainder away in yreg for comparison
		movff	zreg0, xreg0		; Copy truncated result to xreg
		movff	zreg1, xreg1
		movff	zreg2, xreg2
		movff	zreg3, xreg3
		call	cmpxy				; test if result < yreg, C=0 if yes
		bc		intsq_exit			; If result >= yreg, no rounding
		movlw	0x01
		addwf	xreg0, F			; Otherwise, increment result by 1
		movlw	0x00
		addwfc	xreg1, F
		addwfc	xreg2, F
		addwfc	xreg3, F
intsq_exit
		return


;---------------------------------------------------------
; Convert xreg to BCD. Modifies xreg and destroys treg.
; see https://en.wikipedia.org/wiki/Double_dabble
;
; This 32-bit version accepts positive values up to 0x 05F5 E0FF (99999999)
;
; The improved 2016 version has the following changes:
; 	1) We check for the limited range and branch into a faster 16-bit version
;	2) and an even faster 8-bit version is available, too (more code to improve speed)
;	3) The decimal adjust function has been tightened and no longer uses temp3
;	4) It no longer relies on FSR0

; x_to_bcd -- convert the binary value in xreg to packed BCD in xreg
; (two decimal digits per byte, least significant digits in xreg0).
; The binary value is moved to treg and shifted into xreg bit by bit;
; before every shift, each BCD digit >= 5 is corrected by adding 3.
; Three variants are selected by the magnitude of the input:
; 32 shifts (full), 16 shifts (upper word zero), 8 shifts (only byte 0 used).
; Destroys treg, temp1, temp2 and W.
x_to_bcd				; Convert xreg to BCD

		movff	xreg0, treg0		; We start out with the hex number in treg
		movff	xreg1, treg1		; and xreg=0. Each nybble of xreg will be
		movff	xreg2, treg2		; a power of 10
		movff	xreg3, treg3
		call	clear_xreg
		movf	treg3, W
		iorwf	treg2, W			; Check if 16 bits are sufficient
		bz		x_to_bcd_16bit		; Branch taken with W = 0
		movlw	.32					; total of 32 bits to shift
		movwf	temp1

x_to_bcd_1a							; Loop over 32 bits
		movlw	0x04				; 4 bytes
		movwf	temp2				; counter for 4 bytes
x_to_bcd_2a							; Inner loop: Check 8 nybbles and add 3 if value is >=5
		rcall	x_to_bcd_check_tetrade
		movf	xreg0, W			; Cycle 4 bytes so we can access all bytes through xreg0
		movff	xreg1, xreg0
		movff	xreg2, xreg1
		movff	xreg3, xreg2
		movwf	xreg3
		decfsz	temp2				; After 4 rotations the bytes are back in place
		bra		x_to_bcd_2a

		bcf		STATUS,C		; shift one bit from t into x
		rlcf	treg0
		rlcf	treg1
		rlcf	treg2
		rlcf	treg3
		rlcf	xreg0
		rlcf	xreg1
		rlcf	xreg2
		rlcf	xreg3

		decfsz	temp1					; Bit count done?
		bra		x_to_bcd_1a
		return

x_to_bcd_16bit						; The reduced 16-bit version
		iorwf	treg1, W			; Maybe we get really lucky? (W is 0 on entry, so this tests treg1)
		bz		x_to_bcd_8bit
		movlw	.16					; total of 16 bits to shift
		movwf	temp1
x_to_bcd_1b							; Loop over 16 bits
		rcall	x_to_bcd_check_tetrade		; execute on xreg0
		movf	xreg0, W
		movff	xreg1, xreg0		; Swap xreg0, xreg1
		movwf	xreg1
		rcall	x_to_bcd_check_tetrade		; execute on xreg1
		movf	xreg0, W
		movff	xreg1, xreg0		; Swap back xreg0, xreg1
		movwf	xreg1
		rlcf	treg0				; shift one bit from t into x (C is not cleared first: the stray
		rlcf	treg1				; bit entering treg0 cannot travel through 16 bits within 16 shifts)
		rlcf	xreg0
		rlcf	xreg1
		rlcf	xreg2			; no need to test this tetrade, because it cannot be > 6

		decfsz	temp1					; Bit count done?
		bra		x_to_bcd_1b
		return

; A further reduced version with only 8 bits

x_to_bcd_8bit
		movlw	.8					; total of 8 bits to shift
		movwf	temp1
x_to_bcd_1c							; Loop over 8 bits
		rcall	x_to_bcd_check_tetrade		; execute on xreg0
		rlcf	treg0				; shift one bit from t into x (C not cleared, see 16-bit loop)
		rlcf	xreg0
		rlcf	xreg1				; Hundreds digit; never reaches 5 before a shift for inputs <= 255
		decfsz	temp1				; Bit count done?
		bra		x_to_bcd_1c
		return



; This is the addition part. Go over all tetrades (decimal places)
; and add 3 if it is 5 or higher (here done for 2 tetrades only)
; This type of addition, *followed* by a left shift (x2) ensures
; that any value more than 4 bits carries correctly. Examples:
;		3 (+0) = 3 ->  6
;		4 (+0) = 4 ->  8
;		5 (+3) = 8 -> 10 (in hex, interpreted as decimal)
;		6 (+3) = 9 -> 12
;		7 (+3) = A -> 14
; Note the relationship to the DAW instruction, which adds 6 if a nybble is 0A
; or greater -- this corresponds to *after* the shift. Here, we instead
; perform the adjustment *before* the shift.
;
; This improved version adds first and then tests; since we can check a single bit,
; this is much faster than testing first and then adding (new: 2016)

; x_to_bcd_check_tetrade -- BCD pre-shift correction of the two digits
; held in xreg0: every digit that is 5 or higher has 3 added to it.
; Works by adding 3 to both digits at once and taking the addition back,
; digit by digit, where bit 3 of the digit did not become set.
; Clobbers W and the STATUS flags; only xreg0 is modified.
x_to_bcd_check_tetrade
		movlw	0x33			; Test-add 3 to both tetrades
		addwf	xreg0, F		; If any tetrade was 5 or higher, it is now 8 or higher
		movlw	0x03			; Pre-load W with 03 for subtraction
		btfss	xreg0, 3		; Is it 8 or higher?
		subwf	xreg0, F		; No, take back the initial addition, i.e., subtract 3
		movlw	0x30			; Do the same for the high nybble
		btfss	xreg0, 7		; is it 8 or higher?
		subwf	xreg0, F		; No, take back the initial addition
		return					; And the adjusted result is in xreg0


;********************************************


;
; atoi gathers ASCII-coded digits from the input line
; and converts them into an integer number. 
; atoi assumes a number starting at [FSR0], which must be set by caller.
; atoi converts only consecutive strings of [0...9] and exits if any
; non-digit character is encountered


; atoi -- convert the decimal ASCII string at [FSR0] into xreg.
;	In:		FSR0 points at the text
;	Out:	xreg = value (negated if a '-' sign was present);
;			FSR0 points at the first character that was not consumed
; Accepted format: any number of leading blanks, an optional single '+'
; or '-', then consecutive digits '0'...'9'. Conversion stops at the
; first non-digit. Destroys yreg, temp1 (via xtimes10), sign and W.

atoi
		call	clear_xreg			; Value starts at zero
		clrf	sign				; sign<7> = 1 will mean "negative"

atoi_skip_blanks					; Step over leading blanks
		movlw	A' '
		cpfseq	INDF0				; Blank space?
		bra		atoi_plus			; No: look for a sign next
		movf	POSTINC0, W			; Yes: dummy read advances FSR0
		bra		atoi_skip_blanks

atoi_plus							; One optional '+' ...
		movlw	A'+'
		cpfseq	INDF0
		bra		atoi_dash
		movf	POSTINC0, W			; Consume it; only digits may follow
		bra		atoi_digit

atoi_dash							; ... or one optional '-'
		movlw	A'-'
		cpfseq	INDF0
		bra		atoi_digit			; Neither sign: must be digits now
		movf	POSTINC0, W			; Consume it
		bsf		sign, 7				; and remember to negate at the end

atoi_digit							; Per digit: x = x*10 + digit
		movlw	0x3a
		cpfslt	INDF0				; Character below '9'+1?
		bra		atoi_finish			; No: not a digit
		movlw	0x2f
		cpfsgt	INDF0				; Character above '0'-1?
		bra		atoi_finish			; No: not a digit, keep pointing at it

		call	xtimes10			; Make room for the new decimal place
		movf	POSTINC0,W			; Fetch the digit and advance FSR0
		xorlw	0x30				; '0'...'9' -> 0...9
		addwf	xreg0,F
		movlw	0
		addwfc	xreg1,F				; Ripple the carry up to the overflow byte
		addwfc	xreg2,F
		addwfc	xreg3,F
		addwfc	xreg4,F
		bra		atoi_digit

atoi_finish
		btfss	sign, 7				; Was there a minus sign?
		return						; No: done
		goto	chs_x				; Yes: negate; chs_x returns to our caller



; This version converts a hex value to integer (unsigned, always positive)

; atoi_hex -- convert the hexadecimal ASCII string at [FSR0] into xreg
; (unsigned). Leading blanks are skipped; digits 0-9, A-F and a-f are
; accepted. Conversion stops at the first other character, which FSR0
; is left pointing at. Only xreg0..3 take part in the 4-bit shift, so
; digits beyond 8 push the oldest ones out. Destroys sign and W.
atoi_hex			; Convert the data at present location of [FSR0]

		call	clear_xreg

		; Multiply current operand (xreg) by 16, then add the new digit
		; (skip leading blanks as does atoi)

atoh_trailing
		movlw	A' '
		cpfseq	INDF0			; Blank space?
		bra		atoh_get_next	; No, proceed to converting digits
		movf	POSTINC0, W		; Advance FSR0
		bra		atoh_trailing	; and re-test

		; Convert hex ASCII. We can't get around using one scratch byte, so
		; let's use the sign byte; it has no other purpose in atoh anyway.
		; (provided that we leave the input buffer intact)

atoh_get_next
		movf	INDF0, W
		btfsc	WREG, 7				; 0x80...0xFF are totally invalid
		bra		atoh_exit
		movwf	sign				; Temporary storage
		movlw	0x39
		cpfsgt	sign				; Skip for anything above '9'
		bra		atoh_is_digit		; Jump taken for 39 and below
		movlw	0x3F
		cpfsgt	sign				; We need to separately trap 3A...3F
		bra		atoh_exit			; 3A...3F are invalid, exit immediately
		movlw	0xDF				; 40...7F, but only 41...46, 61...66 valid
		andwf	sign, F				; Convert to upper case, only 40...5F remain
		movlw	0x40
		xorwf	sign, W				; Gah. No cpfsne available.
		bz		atoh_exit			; Exit for 0x40 and 0x60
		movlw	0x07
		subwf	sign, F				; Now, 0x41...0x5F become 0x3A...0x58
atoh_is_digit						; and we have continuity with 0x00...0x39
		movlw	0x30				; Subtract 0x30. 2F and below turn negative
		subwf	sign, F				; and 30...3F become 00...0F
		bn		atoh_exit			; Negative result -> exit
		movlw	0x10
		cpfslt	sign				; This would be from any illegal ASCII, G, H, I...
		bra		atoh_exit			; If so, exit
		movf	POSTINC0, W			; Now we have a valid and converted hex nybble, advance the pointer
		movf	sign, W				; and load the nybble into W

atoh_shift_digit
		call	xtimes16		; Shift by 4 bits, leaves W intact
		addwf	xreg0,F			; New nybble; might also OR it into xreg0
		movlw	0
		addwfc	xreg1,F			; No carry can occur: the low nybble of xreg0 was zero
		addwfc	xreg2,F
		addwfc	xreg3,F
		addwfc	xreg4,F
		bra		atoh_get_next

atoh_exit
		return







	end


