source: trunk/lib_ir/AgnerTestP/PMCTest/PMCTestB64.asm @ 4221

Last change on this file since 4221 was 4221, checked in by linmengl, 5 years ago

initial checkin of Agner Fog's performance script

File size: 13.7 KB
Line 
1comment & ---------------------------------------------------------------------
2                          PMCTestB64.asm              © 2013-08-20 Agner Fog
3
4                PMC Test program for multiple threads
5
6This program is intended for testing the performance of a little piece of
7code written in assembly language.
8The code to test is inserted at the place marked "Test code start".
9All sections that can be modified by the user are marked with ###########.
10
11The code to test will be executed REPETITIONS times and the test results
12will be output for each repetition. This program measures how many clock
13cycles the code to test takes in each repetition. Furthermore, it is
14possible to set a number of Performance Monitor Counters (PMC) to count
15the number of micro-operations (uops), cache misses, branch mispredictions,
16etc.
17
18The setup of the Performance Monitor Counters is microprocessor-specific.
19The specifications for PMC setup for each microprocessor family are defined
20in the tables CounterDefinitions and CounterTypesDesired.
21
22See PMCTest.txt for instructions.
23
24© 2000-2013 GNU General Public License www.gnu.org/licenses
25
26----------------------------------------------------------------------------- &
27
28; Operating system: 0 = Linux, 1 = Windows
29WINDOWS  EQU    1
30
31; Define whether AVX and YMM registers used
32USEAVX        = 0
33
34; Define cache line size (to avoid threads sharing cache lines):
35CACHELINESIZE = 64
36
37DATA SEGMENT ALIGN(CACHELINESIZE)
38
39;##############################################################################
40;#
41;#            List of desired counter types and other user definitions
42;#
43;##############################################################################
44 
45; Here you can select which performance monitor counters you want for your test.
46; Select id numbers from the table CounterDefinitions[] in PMCTestA.cpp.
47
48USE_PERFORMANCE_COUNTERS  equ  1        ; Tell if you are using performance counters
49
50CounterTypesDesired label DWORD
51    DD      1        ; core cycles (Intel only)
52    DD      9        ; instructions
53    DD    100        ; uops
54    DD    101        ; data cache misses
55
56   
57; Number of counters defined
58IF USE_PERFORMANCE_COUNTERS
59NUM_COUNTERS = ($ - CounterTypesDesired) / 4
60ELSE
61NUM_COUNTERS = 0
62ENDIF
63
64; Number of repetitions of test.
65REPETITIONS = 12
66
67; Number of threads
68NUM_THREADS = 1
69
70; Subtract overhead from clock counts (0 if not)
71SUBTRACT_OVERHEAD = 1
72
73; Number of repetitions in loop to find overhead
74OVERHEAD_REPETITIONS = 4
75
76; Maximum number of PMC counters
77MAXCOUNTERS = 6              ; must match value in PMCTest.h
78
79IF NUM_COUNTERS GT MAXCOUNTERS
80   NUM_COUNTERS = MAXCOUNTERS
81ENDIF
82
83; Define array sizes
84MAXREPEAT = REPETITIONS
85
86;------------------------------------------------------------------------------
87;
88;                  global data
89;
90;------------------------------------------------------------------------------
91
92public NumCounters, MaxNumCounters, EventRegistersUsed
93public UsePMC, Counters, CounterTypesDesired
94public PThreadData, ClockResultsOS, PMCResultsOS, NumThreads, ThreadDataSize
95public RatioOut, TempOut, RatioOutTitle, TempOutTitle
96
97
98; Per-thread data:
99ALIGN   CACHELINESIZE
100; Data for first thread
101ThreadData label dword                                     ; beginning of thread data block
102CountTemp        DD    MAXCOUNTERS + 1            dup (0)  ; temporary storage of counts
103CountOverhead    DD    MAXCOUNTERS + 1            dup (-1) ; temporary storage of count overhead
104ClockResults     DD    REPETITIONS                dup (0)  ; clock counts
105PMCResults       DD    REPETITIONS * MAXCOUNTERS  dup (0)  ; PMC counts
106align 8
107RSPSave          DQ    0                                 ; save stack pointer
108ALIGN   CACHELINESIZE                  ; Make sure threads don't use same cache lines
109THREADDSIZE = (offset $ - offset ThreadData)          ; size of data block for each thread
110
111; Define data blocks of same size for remaining threads
112IF NUM_THREADS GT 1
113DB (NUM_THREADS - 1) * THREADDSIZE DUP (0)
114ENDIF
115
116; Global data
117PThreadData     DQ    ThreadData               ; Pointer to measured data for all threads
118NumCounters     DD    0                        ; Will be number of valid counters
119MaxNumCounters  DD    NUM_COUNTERS             ; Tell PMCTestA.CPP length of CounterTypesDesired
120UsePMC          DD    USE_PERFORMANCE_COUNTERS ; Tell PMCTestA.CPP if RDPMC used. Driver needed
121NumThreads      DD    NUM_THREADS              ; Number of threads
122ThreadDataSize  DD    THREADDSIZE              ; Size of each thread data block
123ClockResultsOS  DD    ClockResults-ThreadData  ; Offset to ClockResults
124PMCResultsOS    DD    PMCResults-ThreadData    ; Offset to PMCResults
125Counters        DD    MAXCOUNTERS dup (0)      ; Counter register numbers used will be inserted here
126EventRegistersUsed DD MAXCOUNTERS dup (0)      ; Set by MTMonA.cpp
127
128
129; optional extra output column definitions
130RatioOut      DD   0, 0, 0, 0                ; optional ratio output. Se PMCTest.h
131TempOut       DD   0, 0                      ; optional arbitrary output. Se PMCTest.h
132RatioOutTitle DQ   0                         ; optional column heading
133TempOutTitle  DQ   0                         ; optional column heading
134
135
136;##############################################################################
137;#
138;#                 User data
139;#
140;##############################################################################
141ALIGN   CACHELINESIZE
142
143; Put any data definitions your test code needs here
144
145d0 label dword
146q0 label qword
147UserData         DD    1000H dup (0)
148
149
150;------------------------------------------------------------------------------
151;
152;                  Macro definitions
153;
154;------------------------------------------------------------------------------
155
156SERIALIZE MACRO             ; serialize CPU
157       xor     eax, eax
158       cpuid
159ENDM
160
161CLEARXMMREG MACRO N         ; set xmm(N) register to 0
162        pxor xmm&N, xmm&N
163ENDM             
164
165CLEARALLXMMREG MACRO        ; set all xmm registers to 0
166IF  USEAVX
167        VZEROALL            ; clear all ymm registers
168ELSE       
169                I = 0
170                REPT 16
171        CLEARXMMREG %I      ; clear all xmm registers
172                I = I + 1
173                ENDM
174ENDIF
175ENDM             
176
177;------------------------------------------------------------------------------
178;
179;                  Test Loop
180;
181;------------------------------------------------------------------------------
182.code
183
184;extern "C" int TestLoop (int thread) {
185; This function runs the code to test REPETITIONS times
186; and reads the counters before and after each run:
187
188TestLoop PROC
189        push    rbx
190        push    rbp
191        push    r12
192        push    r13
193        push    r14
194        push    r15
195IF      WINDOWS                    ; These registers must be saved in Windows, not in Linux
196        push    rsi
197        push    rdi
198        sub     rsp, 0A8H           ; Space for saving xmm6 - 15 and align
199        movaps  [rsp], xmm6
200        movaps  [rsp+10H], xmm7
201        movaps  [rsp+20H], xmm8
202        movaps  [rsp+30H], xmm9
203        movaps  [rsp+40H], xmm10
204        movaps  [rsp+50H], xmm11
205        movaps  [rsp+60H], xmm12
206        movaps  [rsp+70H], xmm13
207        movaps  [rsp+80H], xmm14
208        movaps  [rsp+90H], xmm15       
209        mov     r15d, ecx          ; Thread number
210ELSE    ; Linux
211        mov     r15d, edi          ; Thread number
212ENDIF
213       
214; Register use:
215;   r13: pointer to thread data block
216;   r14: loop counter
217;   r15: thread number
218;   rax, rbx, rcx, rdx: scratch
219;   all other registers: available to user program
220
221
222;##############################################################################
223;#
224;#                 User Initializations
225;#
226;##############################################################################
227; You may add any initializations your test code needs here.
228; Registers esi, edi, ebp and r8 - r12 will be unchanged from here to the
229; Test code start.
230;
231
232        finit                ; clear all FP registers
233       
234        CLEARALLXMMREG       ; clear all xmm or ymm registers
235
236        lea rsi, d0
237        lea rdi,[rsi+120h]
238        xor ebp,ebp
239       
240
241;##############################################################################
242;#
243;#                 End of user Initializations
244;#
245;##############################################################################
246
247        lea     r13, [ThreadData]             ; address of first thread data block
248        ;imul    eax, r15d, THREADDSIZE       ; offset to thread data block
249        DB      41H, 69H, 0C7H                ; fix bug in ml64
250        DD      THREADDSIZE
251        add     r13, rax                      ; address of current thread data block
252        mov     [r13+(RSPSave-ThreadData)],rsp ; save stack pointer
253
254IF  SUBTRACT_OVERHEAD
255; First test loop. Measure empty code
256        xor     r14d, r14d                    ; Loop counter
257
258TEST_LOOP_1:
259
260        SERIALIZE
261     
262        ; Read counters
263        I = 0
264REPT    NUM_COUNTERS
265        mov     ecx, [Counters + I*4]
266        rdpmc
267        mov     [r13 + I*4 + 4 + (CountTemp-ThreadData)], eax
268        I = I + 1
269ENDM       
270
271        SERIALIZE
272
273        ; read time stamp counter
274        rdtsc
275        mov     [r13 + (CountTemp-ThreadData)], eax
276
277        SERIALIZE
278
279        ; Empty. Test code goes here in next loop
280
281        SERIALIZE
282
283        ; read time stamp counter
284        rdtsc
285        sub     [r13 + (CountTemp-ThreadData)], eax        ; CountTemp[0]
286
287        SERIALIZE
288
289        ; Read counters
290        I = 0
291REPT    NUM_COUNTERS
292        mov     ecx, [Counters + I*4]
293        rdpmc
294        sub     [r13 + I*4 + 4 + (CountTemp-ThreadData)], eax  ; CountTemp[I+1]
295        I = I + 1
296ENDM       
297
298        SERIALIZE
299
300        ; find minimum counts
301        I = 0
302REPT    NUM_COUNTERS + 1
303        mov     eax, [r13+I*4+(CountTemp-ThreadData)]       ; -count
304        neg     eax
305        mov     ebx, [r13+I*4+(CountOverhead-ThreadData)]   ; previous count
306        cmp     eax, ebx
307        cmovb   ebx, eax
308        mov     [r13+I*4+(CountOverhead-ThreadData)], ebx   ; minimum count       
309        I = I + 1
310ENDM       
311       
312        ; end second test loop
313        inc     r14d
314        cmp     r14d, OVERHEAD_REPETITIONS
315        jb      TEST_LOOP_1
316
317ENDIF   ; SUBTRACT_OVERHEAD
318
319       
320; Second test loop. Measure user code
321        xor     r14d, r14d                    ; Loop counter
322
323TEST_LOOP_2:
324
325        SERIALIZE
326     
327        ; Read counters
328        I = 0
329REPT    NUM_COUNTERS
330        mov     ecx, [Counters + I*4]
331        rdpmc
332        mov     [r13 + I*4 + 4 + (CountTemp-ThreadData)], eax
333        I = I + 1
334ENDM       
335
336        SERIALIZE
337
338        ; read time stamp counter
339        rdtsc
340        mov     [r13 + (CountTemp-ThreadData)], eax
341
342        SERIALIZE
343
344;##############################################################################
345;#
346;#                 Test code start
347;#
348;##############################################################################
349
350; Put the assembly code to test here
351; Don't modify r13, r14, r15!
352
353; œœ
354
355rept 100        ; example: 100 shift instructions
356
357shr eax,5
358
359endm
360
361
362
363;##############################################################################
364;#
365;#                 Test code end
366;#
367;##############################################################################
368
369        SERIALIZE
370
371        ; read time stamp counter
372        rdtsc
373        sub     [r13 + (CountTemp-ThreadData)], eax        ; CountTemp[0]
374
375        SERIALIZE
376
377        ; Read counters
378        I = 0
379REPT    NUM_COUNTERS
380        mov     ecx, [Counters + I*4]
381        rdpmc
382        sub     [r13 + I*4 + 4 + (CountTemp-ThreadData)], eax  ; CountTemp[I+1]
383        I = I + 1
384ENDM       
385
386        SERIALIZE
387
388        ; subtract counts before from counts after
389        mov     eax, [r13 + (CountTemp-ThreadData)]            ; -count
390        neg     eax
391IF      SUBTRACT_OVERHEAD
392        sub     eax, [r13+(CountOverhead-ThreadData)]   ; overhead clock count       
393ENDIF   ; SUBTRACT_OVERHEAD       
394        mov     [r13+r14*4+(ClockResults-ThreadData)], eax      ; save clock count
395       
396        I = 0
397REPT    NUM_COUNTERS
398        mov     eax, [r13 + I*4 + 4 + (CountTemp-ThreadData)]
399        neg     eax
400IF      SUBTRACT_OVERHEAD
401        sub     eax, [r13+I*4+4+(CountOverhead-ThreadData)]   ; overhead pmc count       
402ENDIF   ; SUBTRACT_OVERHEAD       
403        mov     [r13+r14*4+I*4*REPETITIONS+(PMCResults-ThreadData)], eax      ; save count       
404        I = I + 1
405ENDM       
406       
407        ; end second test loop
408        inc     r14d
409        cmp     r14d, REPETITIONS
410        jb      TEST_LOOP_2
411
412        ; clean up
413        mov     rsp, [r13+(RSPSave-ThreadData)]   ; restore stack pointer       
414        finit
415        cld
416IF USEAVX
417        VZEROALL                 ; clear all ymm registers
418ENDIF       
419
420        ; return REPETITIONS;
421        mov     eax, REPETITIONS   ; return value
422       
423IF      WINDOWS                    ; Restore registers saved in Windows
424        movaps  xmm6, [rsp]
425        movaps  xmm7, [rsp+10H]
426        movaps  xmm8, [rsp+20H]
427        movaps  xmm9, [rsp+30H]
428        movaps  xmm10, [rsp+40H]
429        movaps  xmm11, [rsp+50H]
430        movaps  xmm12, [rsp+60H]
431        movaps  xmm13, [rsp+70H]
432        movaps  xmm14, [rsp+80H]
433        movaps  xmm15, [rsp+90H]
434        add     rsp, 0A8H           ; Free space for saving xmm6 - 15
435        pop     rdi
436        pop     rsi
437ENDIF
438        pop     r15
439        pop     r14
440        pop     r13
441        pop     r12
442        pop     rbp
443        pop     rbx
444        ret
445       
446TestLoop ENDP
447
448END
Note: See TracBrowser for help on using the repository browser.