;++
; MACRO64$PI.M64 -- Based on original algorithm and C program by Remy Dube.
;
;    © Digital Equipment Corporation 1992, 1993.  All rights reserved.
;
;    Restricted Rights: Use, duplication, or disclosure by the 
;    U.S. Government is subject to restrictions as set forth in 
;    subparagraph (c) (1) (ii) of DFARS 252.227-7013, or in 
;    FAR 52.227-19, or in FAR 52.227-14 Alt. III, as applicable.
;
;    This software is proprietary to and embodies the confidential 
;    technology of Digital Equipment Corporation.  Possession, use, or 
;    copying of this software and media is authorized only pursuant to 
;    a valid written license from Digital or an authorized sublicensor.
;
; This is a complete MACRO-64 program which computes PI to a specified 
; number of digits.  This example is one of three installed with MACRO-64:
;
;	MACRO64$HELLO.M64	- Simple Hello World program
;				  Demonstrates calling standard macros
;	MACRO64$WHAMI.M64	- Program that displays WHAMI IPR 
;				  Demonstrates system calls
;	MACRO64$PI.M64		- Program that computes PI (this program)
;				  Demonstrates general programming with 
;                                 MACRO-64 along with optimization techniques
;
; Digital suggests you become familiar with the concepts illustrated in 
; the MACRO64$HELLO.M64 example and the MACRO64$WHAMI.M64 example before 
; you attempt to understand this example example.
;
;
; The original algorithm implemented by this program is by Remy Dube.  
; This MACRO-64 implementation is adapted from Remy's C implementation.  
; While the algorithm and implementation are reasonably fast and accurate, 
; the intent of this program is to illustrate MACRO-64 programming 
; concepts rather than set world speed or precision records.  This program 
; illustrates the use of a number of optimization techniques, such as loop 
; unrolling and routine inlining.  In several instances, the context in 
; which these techniques are applied causes the technique to yield only a 
; marginal benefit.  Furthermore, due to the decreased development and 
; maintenance costs and increased portability associated with High Order 
; Languages, it would not normally be advisable to write an entire program 
; such as this in assembly language.  Nonetheless, this program serves to 
; illustrate a number of paradigms applicable to efficient programming in 
; MACRO-64.
;
; This program calls the DEC C Runtime Library to perform I/O and for 
; other ancillary tasks.  Note that the DEC C Runtime Library is available 
; on all OpenVMS AXP systems, regardless of whether you have purchased and 
; installed the DEC C compiler product.
;
; To run this program on an OpenVMS AXP system, use the following commands:
;
;	$ macro/alpha_axp/object=pi sys$examples:macro64$pi
;	$ link pi
;	$ run pi
;--


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                         ASSEMBLY-TIME CONSTANTS
;---------------------------------------------------------------------------

; Turn on optimizations and automatic alignment.
	.enable align_code, align_data, peephole, schedule

PRINT_WIDTH	= 100 ; Width of output device
BIGINT_BYTES	= 8   ; Number of bytes in a quadword
BIGINT_SHIFTER	= 3   ; L-Shift this much to convert array index to byte offset
BIGINT_BITS	= BIGINT_BYTES * 8


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                          MODULE VARIABLE DATA
; ---------------------------------------------------------------------------
	$data_section
WEIGHT:		.quad		0
FILE_VAR:	.quad		0
LOG_4:		.g_floating 	0.0
PRECISION:	.quad		0
IMAX:		.quad		0
CLUSTER_SIZE:	.quad		0
CLUSTER_VECT:	.quad		0
CLUSTER_VECT_SIZE:
		.quad		0
ISTOP:		.quad		0
START:		.quad		0
FINISH:		.quad		0
DURATION:	.quad		0
SCRATCH:	.byte 		%repeat(99,<0,>)0

CLK_TCK:	.g_floating 	100.0 ; From DECC$LIBRARY_INCLUDE:TIME.H

G1:		.g_floating 	1.0
G4:		.g_floating	4.0


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                        UTILITY MACROS
;---------------------------------------------------------------------------

;++
; Macro IPOW
; Abstract: Integer power function for assembly-time values.
; Inputs: A - Value to be raised to a power
;         B - Power to which to raise value
; Method: Multiply A times itself B times.
;--
	.macro IPOW A B RESULT
	    RESULT = %repeat(<<B>>,<<A>*>) 1
	.endm IPOW


; The LOOP_UNROLL assembly-time variable is used to control manual loop 
; unrolling:  Use 0 For no loop unrolling, 1 for normal loop unrolling, 
; or a value greater than 1 for a higher, global unroll factor.
LOOP_UNROLL	= 1


;++
; Macro UNROLL
; Abstract:
;   Macro UNROLL is used to assist in in manually unrolling a loop.   
;   It can be beneficial to repeat the body of a loop a number of times 
;   with branch-outs between each repetition to the loop exit.  At 
;   the least, you gain the benefit of a a branch not taken vs. a branch 
;   taken.  While this effect may not gain very much, when loop unrolling 
;   is combined with instruction scheduling (.ENABLE SCHEDULE), you may 
;   be able to schedule portions successive repetitions of the loop in 
;   parallel.  See Appendix B of the MACRO-64 Assembler for OpenVMS AXP 
;   Systems Reference Manual for more information on this optimization 
;   technique.
;
;   To use UNROLL, you must define a macro that expands to the body of the 
;   loop.  Specify the name of this macro with the BODY argument.  You
;   must also specify 2 branch instruction statements: 1 that tests a
;   condition and branches to the beginning of the loop to continue, and
;   another that tests the opposite condition and branches to the end of the
;   loop to exit.  Specify the loop continuation statement with the 
;   CONTINUE argument.  Specify the loop exit statement with the EXIT 
;   argument.  Specify the target label for the loop-continue statement
;   as %%LOOP_START%% and the target for the loop-exit statement as
;   %%LOOP_END%%.  For instance, the continue statement might be a BNE and 
;   the end statement would be BEQ, or vice versa.  Often, you must precede 
;   the branch instruction with a comparison instruction.  If so, specify
;   the comparison instruction statement with the COMPARE argument.
;
;   You can use the TEST_FIRST argument to specify the style of the loop.  
;   Specify 1 to get generate an exit test prior to the first iteration of 
;   the loop.  Specify 0 to always execute the loop at least once.
;
;   You can experiment with different values for the FACTOR argument.  A 
;   FACTOR of 0 or 1 results in a single copy of the loop body -- that is, no 
;   loop unrolling.  FACTORs greater than 1 can improve runtime performance 
;   at the expense of increased code size.  However, depending on the 
;   size of the loop body, too large a FACTOR can cause the repeated loop 
;   body to overflow the instruction cache, thus negatively impacting 
;   performance.
;
;   In addition, if the loop body contains a call to another routine, 
;   it is unlikely that the scheduler (.ENABLE SCHEDULE) will be able to
;   concurrently schedule successive repetitions of the loop body.  As a 
;   result, the benefit realized by unrolling the loop will likely 
;   reduce to the difference in cost between a branch taken and a branch 
;   not taken times the FACTOR you specify.
;
;   Thus, the best loops to unroll are those that are in your critical 
;   performance path, contain a small number of instructions, and do not 
;   call other routines.
;
; Inputs:
;   BODY - Name of the macro that expands to the loop body
;   FACTOR - Specific repetition factor to use when unrolling the loop
;   CONTINUE - A conditional branch instruction statement to continue the loop
;   EXIT - A conditional branch instruction statement to exit the loop
;   COMPARE - An optional comparison instruction to use prior to the CONTINUE 
;     or EXIT instruction
;   TEST_FIRST - non-zero means test the exit condition prior to the 1st 
;     iteration, zero means always execute the loop at least once
;   UNROLL_LOOP (assembly variable) - Controls whether or not to unroll loops
;     or whether to increase the unroll factor for all unrolled loops
;
; Outputs:
;   The body of the loop is repeated FACTOR*UNROLL_LOOP times.
;
; Example usage:
;
;	; First define the loop body as a macro
;	.macro LOOP_BODY
;	    ldq    r0, (r5)
;           lda	   r5, 8(r5)
;           lda    r1, 1(r1)
;	.endm LOOP_BODY
;
;	; Now invoke the UNROLL macro to unroll the loop
;	UNROLL BODY=LOOP_BODY, FACTOR=3, TEST_FIRST=1,	-
;	    COMPARE  = <cmple r0, r2, r22>,		-
;	    CONTINUE = <bne r22, %%LOOP_START%%>,	-
;	    EXIT     = <beq r22, %%LOOP_END%%>
;
;	; The above use of UNROLL would result in the following code 
;	; being generated (where LOOP_START and LOOP_END are unique labels 
;       ; for each invocation of UNROLL):
;
;		cmple r0, r2, r22    ; These 2 instructions are suppressed 
;		beq   r22, LOOP_END  ;  if you specify TEST_FIRST=0
;   	LOOP_START:
;		ldq   r0, (r5)
;		lda   r5, 8(r5)
;	        lda   r1, 1(r1)
;		cmple r0, r2, r22
;		beq   r22, LOOP_END
;		ldq   r0, (r5)
;	        lda   r5, 8(r5)
;		lda   r1, 1(r1)
;		cmple r0, r2, r22
;		beq   r22, LOOP_END
;		ldq   r0, (r5)
;		lda   r5, 8(r5)
;       	lda   r1, 1(r1)
;		cmple r0, r2, r22
;		bne   r22, LOOP_START
;	LOOP_END:
;--
.macro UNROLL BODY, FACTOR, CONTINUE, EXIT, COMPARE, TEST_FIRST=1
    .if not_defined, UNROLL_INDEX
	UNROLL_INDEX = 0
    .else
	UNROLL_INDEX = UNROLL_INDEX + 1
    .endc
    LOOP_START = "UNROLL_LOOP_START_%integer(UNROLL_INDEX)"
    LOOP_END   = "UNROLL_LOOP_END_%integer(UNROLL_INDEX)"
    UNROLL_HELPER <BODY>, <FACTOR>, <%LOOP_START%>, <%LOOP_END%>, -
	<CONTINUE>, <EXIT>, <COMPARE>, <TEST_FIRST>
.endm UNROLL

; Helper macro for loop unrolling
.macro UNROLL_HELPER BODY, FACTOR, START, END, CONTINUE, EXIT, COMPARE, -
    TEST_FIRST
    .if ne, <TEST_FIRST>
	COMPARE
	EXIT
    .endc
    START:
    .repeat <<FACTOR>-1> * LOOP_UNROLL
	BODY
	COMPARE
	EXIT
    .endr
    BODY
    COMPARE
    CONTINUE
    END:
.endm UNROLL_HELPER


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                       MODULE LOCAL ROUTINES
;---------------------------------------------------------------------------


	$routine ASK_NUMBER_OF_DIGITS, kind=stack, local=true, 		-
	    data_section_pointer=true,					-
	    saved_regs=<r2,r3>
;++
; Abstract:	Inquire from the user terminal who many digits to compute.
; Inputs:	Terminal I/O
; Outputs:	PRECISION - Number of digits to compute, also returned in R0
;--
	$linkage_section
10$:	.asciz "How many digits do you want to compute? "
20$:	.asciz "Computing PI with %-d digits \x0A\x0A"
30$:	.asciz "Please specify a positive, non-zero integer value.\x0A"
	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
100$:	$call 	DECC$GPRINTF, args=10$/a	; Print "How many?"
	$call	DECC$GETS, args=SCRATCH/a	; Get answer
	bne	r0, 200$			; Branch if not ^Z
	stq	r0, PRECISION
	br	400$
200$:	$call 	DECC$ATOL, args=SCRATCH/a	; Convert to longword
	bgt	r0, 300$
	$call	DECC$GPRINTF, args=30$/a	; Print "be positive"
	br	100$
300$:	stq	r0, PRECISION			; Set precision
	; Output "Computing with N digits"
	$call	DECC$GPRINTF, args=<20$/a, r0/l>, scratch_regs=<r22,r23>
	ldq	r0, PRECISION
400$:	$return
	$end_routine ASK_NUMBER_OF_DIGITS


	$routine INITIALIZE, kind=stack, local=true, 			-
	    saved_regs=<r2,r3,r4,r5,r7>,				-
	    data_section_pointer=true
;++
; Abstract:	Start up processing.
; Inputs:	
;   PRECISION
; Outputs:
;   LOG_4		  - log10(4.0)
;   IMAX		  - PRECISION/LOG_4
;   WEIGHT		  - Scale factor for various computations
;   CLUSTER_SIZE	  - Number of decimal digits represented in a quadword 
;   CLUSTER_VECT_SIZE 	  - Number of quadwords needed for spec'd precision
;   CLUSTER_VECT	  - Ptr to array of quadwords 
;   CLUSTER_VECT[0..last] - Initialized to 0
;   FILE_VAR		  - Opened for output to PI.DAT
;--
	$linkage_section
	.define_ireg POWX r4		; r4 holds pow(2,BIGINT_BITS-4)/IMAX*10
	.define_ireg W    r5		; r5 is temp for WEIGHT
	.define_ireg CS   r6		; r6 is temp for CLUSTER_SIZE
	.define_ireg CVS  r7		; r7 is temp for CLUSTER_VECT_SIZE
	IPOW	2 <BIGINT_BITS-4> T	; T=pow(2,BIGINT_BITS-4)
10$:	.quad		T
20$:	.asciz "pi.dat"
30$:	.asciz "w"
40$:	.asciz "        PI WITH %-d digits \x0A \x0A"
	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	.base	r3, $ds				; ...data section
	$call	DECC$GLOG10, args=G4/g		; r0=LOG10(4.0)
	stg	f0, LOG_4			; LOG_4=LOG10(4.0)
	ldt	f1, PRECISION			; Convert precision to...
	cvtqg	f1, f1				; ...G float
	divg	f1, f0, f1			; f1=PRECISION/LOG_4
	cvtgq/c	f1, f1				; Convert to quadword
	stt	f1, IMAX			; IMAX=PRECISION/LOG_4
	ldq	$IA1, IMAX			; IA1=IMAX
	mulq	$IA1, #10, $IA1			; IA1=IMAX*10

	;++
	; The OTS$DIV_L routine takes 2 quad word arguments and returns 
	; in r0 the quadword quotient of the first argument divided by the 
	; second argument.  OTS$DIV_L is in the sharable library LIBOTS.EXE, 
	; which is available at link time by default.  A similar routine, 
	; OTS$DIV_I, is available in that same library for longword integer 
	; division.
	;--
	; r0=pow(2,BIGINT_BITS-4)/(IMAX*10)
	$call	OTS$DIV_L, args=<10$/q, $IA1/q> 
	; POWX=pow(2,BIGINT_BITS-4)/(IMAX*10)
	mov	r0, POWX
	mov 	1, W 				; WEIGHT=1
	mov	0, CS				; CLUSTER_SIZE=0

	; Loop computing WEIGHT and CLUSTER_SIZE
	.macro LOOP_BODY			; Loop body for unrolling
	    mulq	W, #10, W		; WEIGHT=WEIGHT*10
	    lda		CS, 1(CS)		; CLUSTER_SIZE=CLUSTER_SIZE+1
	.endm LOOP_BODY
	UNROLL BODY=LOOP_BODY, FACTOR=10, TEST_FIRST=1, 		-
	    COMPARE  = <cmple W, POWX, r23>,				-
	    CONTINUE = <bne r23, %%LOOP_START%%>,			-
	    EXIT     = <beq r23, %%LOOP_END%%>

	stq	W, WEIGHT			; Store local copies of 
	stq	CS, CLUSTER_SIZE		;  WEIGHT and CLUSTER_SIZE
	ldq	$IA0, PRECISION			; IA0=PRECISION
	addq	CS, $IA0, $IA0			; IA0=PRECISION+CLUSTER_SIZE
	lda	$IA0, -1($IA0)			; IA0=PRECISION+CLUSTER_SIZE-1
	; r0=(PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE
	$call	OTS$DIV_L, args=<$IA0/q,CS/q>
	; r0=((PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE)+1
	lda	r0, 1(r0)
	; CLUSTER_VECT_SIZE=((PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE)+1
	mov	r0, CVS
	stq	r0, CLUSTER_VECT_SIZE
	; Allocate CLUSTER_VECT
	sll	r0, #BIGINT_SHIFTER, r0
	$call	DECC$MALLOC, args=r0/l, scratch_regs=<r22,r23>
	stq	r0, CLUSTER_VECT
    	.define_ireg CCA r0			; r0 is current cluster addr
	; Initialize CLUSTER_VECT
	mov	CVS, r1			; r1=CLUSTER_VECT_SIZE
100$:	stq	r31, (CCA)			; *CCA = 0
	lda	r1, -1(r1)			; --r1
	lda	r0, BIGINT_BYTES(R0)		; CCA++
	bgt	r1, 100$			; Branch if not done	
	$call	DECC$FOPEN, args=<20$/a, 30$/a>	; Open output file
	stq	r0, FILE_VAR
	$call	DECC$GFPRINTF, args=<r0/q, 40$/a, PRECISION/l>, 	-
	    scratch_regs=<r22,r23>
	$return
	.undefine_reg POWX 
	.undefine_reg W    
	.undefine_reg CS   
	.undefine_reg CVS  
    	.undefine_reg CCA
	$end_routine INITIALIZE


	$routine MULTIPLY_AND_DIVIDE, kind=stack, 			-
	    saved_regs=<r2,r3,r4,r5,r6,r7,r8,r9,r10>,			-
	    data_section_pointer=true,					-
	    local=true
;++
; Abstract: Walk the CLUSTER_VECTOR array performing a series of
;            multiplications and divisions.
; Inputs:
;   IA0 - NUMERATOR
;   IA1 - DENOMINATOR
;   ISTOP - Indicator of how much to compute
;   WEIGHT
;   CLUSTER_VECT
;   CLUSTER_VECT[0..ISTOP]
; Outputs:
;   CLUSTER_VECT[0..ISTOP]
;--
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	.define_ireg TEMP r4			; r4 is variable 'temp'
	.define_ireg I	  r5			; r5 is variable 'i'
	.define_ireg W    r6			; r6 is local WEIGHT
	.define_ireg CCA  r7			; r7 is curr cluster addr
	.define_ireg NUM  r8			; r8 is local copy of NUM
	.define_ireg DEN  r9			; r9 is local copy of DEN
	.define_ireg CC   r10			; r10 is curr cluster
	clr	TEMP				; TEMP=0
	ldq	I, ISTOP			; I=ISTOP
	lda	I, 1(I)			; I=ISTOP+1
	ldq	W, WEIGHT			; Load local copy of WEIGHT
	ldq	CCA, CLUSTER_VECT		; CCA = &CLUSTER_VECT[0]
	mov	$IA0, NUM			; Get local copy of NUM
	mov	$IA1, DEN			; Get local copy of DEN
	.macro LOOP_BODY			; Unroll loop
	    mulq	TEMP, W, TEMP	; TEMP *= WEIGHT
	    ldq		CC, (CCA)		; CC = *CCA
	    mulq	NUM, CC, r22	; r22 = CC * NUM
	    addq	TEMP, r22, TEMP	; TEMP += CC * NUM
	    $call	OTS$DIV_L, -		; r0=TEMP/DEN
		args=<TEMP/q, DEN/q>
	    stq		r0, (CCA)		; *CCA=TEMP/DEN
	    mulq	DEN, r0, r0		; r0=DEN * (*CCA)
	    subq	TEMP, r0, TEMP	; TEMP -= DEN * (*CCA)
	    lda		CCA, BIGINT_BYTES(CCA); Advance to next cluster
	    lda		I, -1(I)		; Decrement loop index
	.endm LOOP_BODY
	UNROLL BODY=LOOP_BODY, FACTOR=8, TEST_FIRST=1, 			-
	    CONTINUE = <bne I, %%LOOP_START%%>,			-
	    EXIT     = <beq I, %%LOOP_END%%>
	$return
	.undefine_reg TEMP
	.undefine_reg I
	.undefine_reg W    
	.undefine_reg CCA  
	.undefine_reg NUM  
	.undefine_reg DEN  
	.undefine_reg CC   
	$end_routine MULTIPLY_AND_DIVIDE


    ;++
    ; Macro PROPAGATE_CARRY
    ; Abstract:  Walk the CLUSTER_VECTOR array in reverse order propagating
    ;            a carry out from one cluster to its predecessor as necessary.
    ;            (A manually inlined routine.)
    ; Inputs:
    ;	CA - register that holds &CLUSTER_VECT[0]
    ;	WEIGHT
    ;	CLUSTER_VECT[0..last]
    ; Outputs:
    ;	CLUSTER_VECT[0..last]
    ;--
	.macro PROPAGATE_CARRY
	    .define_ireg CCA 	r0		; r0 is addr of current cluster
	    .define_ireg CC	r1		; r1 is current cluster
	    .define_ireg CARRY	r22		; r22 is 'CARRY' variable
	    .define_ireg W	r23		; r23 is local WEIGHT
	    clr		CARRY			; CARRY=0
	    ldq		W, WEIGHT		; load local copy of WEIGHT
	    ldq		CCA, CLUSTER_VECT_SIZE	; CCA=sizeof(CLUSTER_VECT)
	    lda		CCA, -1(CCA)		; CCA=sizeof(CLUSTER_VECT)-1
	    ; CCA=(sizeof(CLUSTER_VECT)-1)*sizeof(CLUSTER_VECT[0])
	    sll		CCA, #BIGINT_SHIFTER, CCA
	    addq	CA, CCA, CCA		; CCA --> last cluster
	    .macro LOOP1_BODY			; Unroll loop 1
		ldq	CC, (CCA)		; r22 = *CCA
		addq	CC, CARRY, CC		; r22 = *CCA + CARRY
		clr	CARRY			; CARRY=0
		.macro LOOP2_BODY 		; Unroll loop 2
		    subq	CC, W, CC 	; CC -= WEIGHT
		    lda		CARRY, 1(CARRY)	; CARRY++
		.endm LOOP2_BODY
		UNROLL BODY=LOOP2_BODY, FACTOR=8, TEST_FIRST=1, 	-
		    COMPARE  = <cmplt CC, W, r16>,			-
		    CONTINUE = <beq   r16, %%LOOP_START%%>,		-
		    EXIT     = <bne   r16, %%LOOP_END%%>
		stq	CC, (CCA)		; Save current cluster
		lda	CCA, -BIGINT_BYTES(CCA) ; Backup 1 cluster
	    .endm LOOP1_BODY
	    UNROLL BODY=LOOP1_BODY, FACTOR=2, TEST_FIRST=1, 		-
		COMPARE  = <cmplt  CCA, CA, r16>,			-
		CONTINUE = <beq    r16, %%LOOP_START%%>,		-
		EXIT     = <bne    r16, %%LOOP_END%%>
	    .undefine_reg CCA
	    .undefine_reg CC
	    .undefine_reg CARRY
	    .undefine_reg W
	.endm PROPAGATE_CARRY


	$routine COMPUTE, kind=stack, 					-
	    saved_regs=<r2,r3,r4,r5,r6,r7,r8,r9,r10,f2,f3,f4>, 		-
	    local=true,							-
	    data_section_pointer=true,					-
	    size=$rsa_end+16
;++
; Abstract: Build up a binary representation of PI in the CLUSTER_VECT array.
;
; Inputs:
;   PRECISION
;   CLUSTER_SIZE
;   IMAX
;   LOG_4
;   CLUSTER_VECT[0]
;
; Outputs:
;   CLUSTER_VECT[0..last]
;
; Notes:
;   $ROUTINE defines the assembly-time variable $RSA_END as the offset 
;   from the beginning of the stack frame to just beyond the end of the 
;   the register save area.  You can use this variable both to define 
;   the stack size with $ROUTINE's SIZE argument as above, and in 
;   referencing the stack beyond the register save area.  In this case,
;   we have allocated an additional 16 bytes of stack storage beyond the
;   end of the register-save area.  Note that the size of the stack frame
;   must be an even multiple of 16.
;--
	$linkage_section
	.define_ireg I		r4		; r4 is variable 'I'
	.define_ireg CS		r5		; r5 is local CLUSTER_SIZE
	.define_ireg TMP	r6		; r6 is variable 'TMP'
	.define_ireg CA		r7		; r7 is local &CLUSTER_VECT[0]
	.define_ireg PCC	r8		; r8 is variable 'PCC'
	.define_ireg I2M1	r9		; r9 is 2*I-1
	.define_ireg I4		r10		; r10 is 4*I and 4*I+2
	.define_freg I.G	f2		; f2 is variable 'I.G', G float 
						;  shadow of I
	.define_freg L4		f3		; f3 is local LOG_4
	.define_freg ONE.G	f4		; f4 holds 1.0 in G floating
	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	ldq	I, IMAX				; I=IMAX
	ldt	I.G, IMAX			; I.G shadows I
	cvtqg	I.G, I.G			; Convert to G format
	ldg	L4, LOG_4			; Load local LOG_4
	ldq	CS, CLUSTER_SIZE		; Load local CLUSTER_SIZE
	ldq	TMP, PRECISION			; TMP=PRECISION
	mov	TMP, PCC			; PCC=PRECISION
	lda	TMP, 2(TMP)			; TMP=PRECISION+2
	addq	CS, TMP, TMP			; TMP=CLUSTER_SIZE+PRECISION+2
	ldq	CA, CLUSTER_VECT		; CA = &CLUSTER_VECT[0]
	addq	PCC, CS, PCC			; PCC=PRECISION+CLUSTER_SIZE
	lda	PCC, -1(PCC)			; PCC=PRECISION+CLUSTER_SIZE-1
	; r0=(PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE
	$call	OTS$DIV_L, args=<PCC/q,CS/q>
	; PCC=(PRECISION+CLUSTER_SIZE-1)/CLUSTER_SIZE
	mov	r0, PCC
	ldg	ONE.G, G1			; Load ONE.G with 1.0
	.macro	LOOP_BODY ?LABEL		; Unroll loop
	    mulg	I.G, L4, f0		; f0=I*LOG_4
	    cvtgq/c	f0, f0			; Convert to integer

	    ;++
	    ; We must store F0 to memory in order to load it into an 
	    ; integer register.  Our invocation of the $ROUTINE macro
	    ; above allocated an additional 16 bytes of stack storage,
	    ; 8 of which we will use now for that purpose.  We can 
	    ; reference the stack beyond the end of the register save 
	    ; area relative to the frame pointer, FP, using the 
	    ; $RSA_END assembly-time variable that is defined by the 
	    ; $ROUTINE macro.
	    ;--
	    stt		f0, $rsa_end(fp)	; Put to memory
	    ldq		$IA0, $rsa_end(fp)	; Read back to I reg
	    ; IA0=(PRECISION-I*LOG_4+2+CLUSTER_SIZE)
	    subq	TMP, $IA0, $IA0
	    $call	OTS$DIV_L, args=<$IA0/q, CS/q>
	    cmple	r0, PCC, r1		; result <= PCC?
	    bne		r1, LABEL    		; Yes, branch
	    mov		PCC, r0			; No, use PCC
LABEL:	    stq		r0, ISTOP		; ISTOP=min(result,PCC)
	    sll		I, #1, I2M1		; I2M1 = 2*I
	    lda		I2M1, -1(I2M1)		; I2M1 = 2*I-1
	    sll		I, #2, I4		; I4 = 4*I
	    $call	MULTIPLY_AND_DIVIDE, args=<I2M1/q, I4/q>, 	-
		local=true
	    lda		I4, 2(I4)		; I4=4*I+2
	    $call	MULTIPLY_AND_DIVIDE, args=<I2M1/q, I4/q>, 	-
		local=true
	    ldq		r0, (CA)		; r0=CLUSTER_VECT[0]
	    lda		r0, 3(R0)		; r0=CLUSTER_VECT[0] + 3
	    stq		r0, (CA)		; CLUSTER_VECT[0] += 3
	    lda		I, -1(I)		; I--
	    subg	I.G, ONE.G, I.G	; I.G shadows I
	.endm LOOP_BODY
	UNROLL BODY=LOOP_BODY, FACTOR=4, TEST_FIRST=1, 			-
	    CONTINUE = <bne I, %%LOOP_START%%>,				-
	    EXIT     = <beq I, %%LOOP_END%%>

	PROPAGATE_CARRY				; Inlined routine call

	$return
	.undefine_reg I
	.undefine_reg CS
	.undefine_reg TMP
	.undefine_reg CA
	.undefine_reg PCC
	.undefine_reg I2M1
	.undefine_reg I4
	.undefine_reg I.G
	.undefine_reg L4
	.undefine_reg ONE.G
	$end_routine COMPUTE


	$routine PRINT_RESULT, kind=stack, 				-
	    saved_regs=<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,f2>,-
	    local=true,							-
	    data_section_pointer=true,					-
	    size=$rsa_end+16
;++
; Abstract:  Print the results of the computation.
; Inputs:
;   START  - seconds at start of computation
;   FINISH - seconds at end of computation
;   FILE_VAR
;   PRECISION
;   WEIGHT
;   CLUSTER_SIZE
;   CLUSTER_VECT_SIZE
;   CLUSTER_VECT[0..last]
;   PRINT_WIDTH
;--
	$linkage_section
	.define_freg SECONDS	f2		; f2 is variable 'SECONDS' 
	.define_ireg MINUTES	r4		; r4 is variable 'MINUTES'
	.define_ireg HOURS	r5		; r5 is variable 'HOURS'
	.define_ireg I		r6		; r6 is variable 'I' 
	.define_ireg CCA	r7		; r7 is variable 'CCA' 
	.define_ireg CC		r8		; r8 is variable 'CC'
	.define_ireg CLIM	r9		; r9 is variable 'CLIM'
	.define_ireg J		r10		; r10 is variable 'J'
	.define_ireg CS		r11		; r11 is local CLUSTER_SIZE
	.define_ireg W		r12		; r12 is local WEIGHT
	.define_ireg COUNT	r13		; r13 is variable COUNT
	.define_ireg P		r14		; r14 is local PRECISION
10$:	.asciz	-
"        Calculation time :  %-d hours %-d  minutes %f seconds \x0A \x0A"
20$:	.asciz	"%-d."
30$:	.asciz  "%-d"
40$:	.asciz	"\x0A  "
50$:	.asciz	"\x0A"


	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	ldq	r22, FINISH			; r22=FINISH
	ldq	r23, START			; r23=START
	subq	r22, r23, r23			; r23=FINISH-START
	stq	r23, $rsa_end(fp)		; Convert FINISH-START
	ldt	f0,  $rsa_end(fp)		; ...to...
	cvtqg	f0, f0				; ...G floating
	ldg	f1, CLK_TCK			; f1=CLK_TCK
	divg	f0, f1, SECONDS			; SECONDS=(FINISH-START)/CLK_TCK
	cvtgq/c	SECONDS, f0			; Convert...
	stt	f0,  $rsa_end(fp)		; ...to integer
	$call	OTS$DIV_L, args=<$rsa_end(fp)/q, 60/a>	; r0=SECONDS/60
	mov	r0, MINUTES			; MINUTES=SECONDS/60
	mulq	r0, #60, r0			; r0=MINUTES*60
	stq	r0, $rsa_end(fp)		; Convert...
	ldt	f0, $rsa_end(fp)		; ...to...
	cvtqg	f0, f0				; G floating
	subg	SECONDS, f0, SECONDS		; SECONDS -= MINUTES*60
	$call	OTS$DIV_L, -			; r0=MINUTES/60
	    args=<MINUTES/q, 60/a>
	mov	r0, HOURS			; HOURS=MINUTES/60
	mulq	HOURS, #60, r0			; r0=HOURS*60
	subq	MINUTES, r0, MINUTES		; MINUTES -= HOURS*60
	$call	DECC$GFPRINTF, -
	    args=<FILE_VAR/q, 10$/a, HOURS/l, MINUTES/l, SECONDS/g>
	ldq	CCA, CLUSTER_VECT		; CCA = &CLUSTER_VECT[0]
	ldq	CLIM, CLUSTER_VECT_SIZE		; CLIM=sizeof(CLUSTER_VECT)
	; CLIM=sizeof(CLUSTER_VECT)*sizeof(CLUSTER_VECT[0])
	sll	CLIM, #BIGINT_SHIFTER, CLIM	
	addq	CCA, CLIM, CLIM			; CLIM --> beyond last cluster
	ldq	CC, (CCA)			; CC = CLUSTER_VECT[0]
	$call	DECC$GFPRINTF, -
	    args=<FILE_VAR/q, 20$/a, CC/l>
	mov	1, I				; I=1
	ldq	CS, CLUSTER_SIZE		; Load local cluster size
	lda	CCA, BIGINT_BYTES(CCA)		; Advance to next cluster
	cmplt	CCA, CLIM, r0			; Beyond last cluster?
	beq	r0, 400$			; Yes, branch to exit loop 1
	ldq	W, WEIGHT			; Load local copy of WEIGHT
	ldq	P, PRECISION			; Load local copy of PRECISION
	clr	COUNT				; COUNT=0
100$:	ldq	CC, (CCA)			; CC = *CCA
	mov	1, J				; J=1
	cmple	J, CS, r1			; J<=CLUSTER_SIZE?
	beq	r1, 300$			; No, branch to exit loop 2
200$:	mulq	CC, #10, CC			; CC *= 10
	$call	OTS$DIV_L, args=<CC/q, W/q>	; r0=CC/WEIGHT
	mov	r0, $IA2			; IA2=CC/WEIGHT
	mulq	W, r0, r0			; r0=IA2*WEIGHT
	subq	CC, r0, CC			; CC -= IA2*WEIGHT
	cmple	COUNT, P, r1			; COUNT<=PRECISION?
	beq	r1, 210$			; No, branch
	sextl	$IA2, $IA2			; Convert to longword
	$call	DECC$GFPRINTF, -
	    args=<FILE_VAR/q, 30$/a, $IA2/l>
210$:	lda	COUNT, 1(COUNT)			; COUNT++
	cmplt	I, #PRINT_WIDTH, r0		; I>=PRINT_WIDTH?
	bne	r0, 220$			; No, branch
	clr	I				; I=0
	$call	DECC$GFPRINTF, -		; Print linefeed + 2 spaces
	    args=<FILE_VAR/q, 40$/a>
220$:	lda	I, 1(I)				; I++
	lda	J, 1(J)				; J++
	cmple	J, CS, r0			; J<=CLUSTER_SIZE?
	bne	r0, 200$			; Yes, branch to continue loop 2
300$:	lda	CCA, BIGINT_BYTES(CCA)		; Advance to next cluster
	cmplt	CCA, CLIM, r0			; Beyond last cluster?
	bne	r0, 100$			; No, branch to continue loop 1
	$call	DECC$GFPRINTF, -		; Print linefeed 
	    args=<FILE_VAR/q, 50$/a>
400$:	$return
	.undefine_reg SECONDS
	.undefine_reg MINUTES
	.undefine_reg HOURS
	.undefine_reg I
	.undefine_reg CCA
	.undefine_reg CC
	.undefine_reg CLIM
	.undefine_reg J
	.undefine_reg CS
	.undefine_reg W
	.undefine_reg COUNT
	.undefine_reg P
	$end_routine PRINT_RESULT


	$routine CLEANUP, kind=stack,					-
	    local=true,							-
	    data_section_pointer=true
;++
; Abstract: Perform shutdown trivia.
; Inputs:  FILE_VAR
; Outputs: The output file is closed.
;--
	.base	r27, $ls			; Access linkage section
	ldq	r22, $dp			; Access...
	.base	r22, $ds			; ...data section
	$call	DECC$FCLOSE, args=FILE_VAR/q
	$return
	$end_routine CLEANUP


;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;                            EXTERNAL ROUTINES
;---------------------------------------------------------------------------

	$routine PI, kind=stack, saved_regs=<r2,r3>,			-
	    data_section_pointer=true, local=false
;++
; Abstract: Program entry point.  
; Outputs:
;   PI.DAT with the specified number of decimal places.
;--
	$linkage_section
COPYRIGHT_MSG:
	.asciz - ; \xA9 is '©', \x0A is <lf>, or '\n' in C parlance
"\xA9 Digital Equipment Corporation 1992, 1993.  All rights reserved.\x0A"

	$code_section
	.base	r27, $ls			; 1st access link sect w/r27
	mov	r27, r2				; Later use r2 for link sect 
	ldq	r3, $dp				; Access...
	.base	r3, $ds				; ...data section
	.base	r2, $ls				; Now use r2 for link sect
	$call	DECC$GPRINTF, -			; Display copyright notice.
	    args=COPYRIGHT_MSG/A
	$call	ASK_NUMBER_OF_DIGITS, local=true
	bgt	r0, 10$				; Branch if positive precision
	mov	1, r0				; Return success status
	$return
10$:	$call	INITIALIZE, local=true
	$call	DECC$CLOCK
	stq	r0, START
	$call	COMPUTE, local=true
	$call	DECC$CLOCK
	stq	r0, FINISH
	$call	PRINT_RESULT, local=true
	$call	CLEANUP, local=true
	mov	1, r0				; Return success status
	$return
	$end_routine PI

	.end PI