source: trunk/lib_ir/AgnerTestP/PMCTest/PMCTestB32.nasm @ 4221

Last change on this file since 4221 was 4221, checked in by linmengl, 5 years ago

initial checkin of Agner Fog's performance script

File size: 13.9 KB
Line 
1;----------------------------------------------------------------------------
2;                        PMCTestB32.nasm              © 2013-08-20 Agner Fog
3;
4;                PMC Test program for multiple threads
5;                           NASM syntax
6; Linux version (for Windows, put underscore prefix on all global names)
7;
8; This program is intended for testing the performance of a little piece of
9; code written in assembly language.
10; The code to test is inserted at the place marked "Test code start".
11; All sections that can be modified by the user are marked with ###########.
12;
13; The code to test will be executed REPETITIONS times and the test results
14; will be output for each repetition. This program measures how many clock
15; cycles the code to test takes in each repetition. Furthermore, it is
16; possible to set a number of Performance Monitor Counters (PMC) to count
17; the number of micro-operations (uops), cache misses, branch mispredictions,
18; etc.
19;
20; The setup of the Performance Monitor Counters is microprocessor-specific.
21; The specifications for PMC setup for each microprocessor family are defined
22; in the tables CounterDefinitions and CounterTypesDesired.
23;
24; See PMCTest.txt for instructions.
25;
26; (c) Copyright 2000 - 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses
27;----------------------------------------------------------------------------
28
29; Define whether AVX and YMM registers used
; USEAVX can be predefined by the build (hence the %ifndef); the default is 1.
; When non-zero, the register clearing in this file uses VZEROALL; when zero,
; xmm0-xmm7 are cleared one by one with PXOR instead.
30%ifndef  USEAVX
31%define  USEAVX   1
32%endif
33
34; Define cache line size (to avoid threads sharing cache lines):
; Used as the alignment of the .data section and of each per-thread data block.
35%define CACHELINESIZE  64
36
37; Define warmup count to get into max frequency state
; TestLoop runs WARMUPCOUNT/10 loop iterations of 10 IMUL instructions each.
; A value of 0 removes the warm-up loop from the build.
38%define WARMUPCOUNT 10000000
39
40global TestLoop
41global NumCounters
42global MaxNumCounters
43global EventRegistersUsed
44global UsePMC
45global Counters
46global CounterTypesDesired
47global PThreadData
48global ClockResultsOS
49global PMCResultsOS
50global NumThreads
51global ThreadDataSize
52global RatioOut
53global TempOut
54global RatioOutTitle
55global TempOutTitle
56
57
58SECTION .data   align = CACHELINESIZE
59
60;##############################################################################
61;#
62;#            List of desired counter types and other user definitions
63;#
64;##############################################################################
65 
66; Here you can select which performance monitor counters you want for your test.
67; Select id numbers from the table CounterDefinitions[] in PMCTestA.cpp.
68
; Exported to the C++ side through the UsePMC variable.
69%define USE_PERFORMANCE_COUNTERS   1        ; Tell if you are using performance counters
70
71; Maximum number of PMC counters
; Sizes CounterTypesDesired, Counters, EventRegistersUsed and the per-thread
; CountTemp / CountOverhead / PMCResults arrays.
72%define MAXCOUNTERS   6              ; must match value in PMCTest.h
73
74; Number of PMC counters
; This is the number of RDPMC reads unrolled into each test loop and the value
; exported through the MaxNumCounters variable.
75%define NUM_COUNTERS  4              ; must match value in PMCTest.h
76
; Table of MAXCOUNTERS dwords: the counter ids listed here, then zero padding
; emitted by the `times` line below. List at most MAXCOUNTERS ids, otherwise
; the repeat count of that `times` line becomes negative.
77CounterTypesDesired:
78    DD      1        ; core cycles (Intel only)
79    DD      9        ; instructions
80    DD    100        ; uops
81    DD    311        ; data cache misses
82
83times (MAXCOUNTERS - ($-CounterTypesDesired)/4)  DD 0
84
85; Number of repetitions of test.
; Also the length of ClockResults, the row length of PMCResults and the value
; returned by TestLoop.
86%define REPETITIONS  8
87
88; Number of threads
; One data block of THREADDSIZE bytes is reserved per thread; the value is
; exported through the NumThreads variable.
89%define NUM_THREADS  3
90
91; Subtract overhead from clock counts (0 if not)
; When non-zero, TestLoop first times an empty code section and subtracts the
; minimum counts found there from every result.
92%define SUBTRACT_OVERHEAD  1
93
94; Number of repetitions in loop to find overhead
95%define OVERHEAD_REPETITIONS  4
96
97; Define array sizes
; NOTE(review): MAXREPEAT is not referenced anywhere else in the visible part
; of this file.
98%assign MAXREPEAT  REPETITIONS
99
100;------------------------------------------------------------------------------
101;
102;                  global data
103;
104;------------------------------------------------------------------------------
105
; Per-thread data:
; Every thread owns one block of THREADDSIZE bytes. TestLoop locates the block
; of thread n at ThreadData + n*THREADDSIZE. Layout of one block:
;   CountTemp      (MAXCOUNTERS+1 dwords)  scratch; [0] = clock, [1..] = PMC counts
;   CountOverhead  (MAXCOUNTERS+1 dwords)  minimum overhead counts, same order
;   ClockResults   (REPETITIONS dwords)    clock count of each repetition
;   PMCResults     (REPETITIONS*MAXCOUNTERS dwords)  PMC counts
;   padding up to the next multiple of CACHELINESIZE
;
; CountOverhead must start at -1 (0FFFFFFFFh) in EVERY block: the overhead loop
; keeps the unsigned minimum of the previous value and each new measurement, so
; a block that starts at 0 would report an overhead of 0 forever and
; SUBTRACT_OVERHEAD would have no effect for that thread.
align   CACHELINESIZE, DB 0

; Data for first thread. The labels are used (relative to ThreadData) as field
; offsets into the block of any thread.
ThreadData:                                                ; beginning of thread data block
CountTemp:     times  (MAXCOUNTERS + 1)          DD   0    ; temporary storage of counts
CountOverhead: times  (MAXCOUNTERS + 1)          DD  -1    ; minimum count overhead, -1 = none measured yet
ClockResults:  times   REPETITIONS               DD   0    ; clock counts
PMCResults:    times  (REPETITIONS*MAXCOUNTERS)  DD   0    ; PMC counts
ALIGN   CACHELINESIZE, DB 0                                ; Make sure threads don't use same cache lines
THREADDSIZE  equ     ($ - ThreadData)                      ; size of data block for each thread

; Define identically laid out and identically initialised data blocks for the
; remaining threads. Each block starts on a cache line boundary, so the ALIGN
; inside the loop pads every block to exactly THREADDSIZE bytes.
%if  NUM_THREADS > 1
  %rep NUM_THREADS - 1
               times  (MAXCOUNTERS + 1)          DD   0    ; CountTemp
               times  (MAXCOUNTERS + 1)          DD  -1    ; CountOverhead
               times   REPETITIONS               DD   0    ; ClockResults
               times  (REPETITIONS*MAXCOUNTERS)  DD   0    ; PMCResults
               ALIGN   CACHELINESIZE, DB 0                 ; pad block to THREADDSIZE
  %endrep
%endif
121
122; Global data
; All variables below except ESP_SAVE are exported with `global` at the top of
; this file. Every entry is a 32-bit dword (DD).
123PThreadData     DD    ThreadData                ; Pointer to measured data for all threads
124NumCounters     DD    0                         ; Will be number of valid counters
; NOTE: despite its name, MaxNumCounters holds NUM_COUNTERS, not MAXCOUNTERS.
125MaxNumCounters  DD    NUM_COUNTERS              ; Tell PMCTestA.CPP length of CounterTypesDesired
126UsePMC          DD    USE_PERFORMANCE_COUNTERS  ; Tell PMCTestA.CPP if RDPMC used. Driver needed
127NumThreads      DD    NUM_THREADS               ; Number of threads
128ThreadDataSize  DD    THREADDSIZE               ; Size of each thread data block
129ClockResultsOS  DD    ClockResults-ThreadData   ; Offset to ClockResults
130PMCResultsOS    DD    PMCResults-ThreadData     ; Offset to PMCResults
; TestLoop loads Counters[i] into ECX before each RDPMC.
131Counters              times MAXCOUNTERS   DD 0  ; Counter register numbers used will be inserted here
132EventRegistersUsed    times MAXCOUNTERS   DD 0  ; Set by MTMonA.cpp
133RatioOut        DD    0, 0, 0, 0                ; optional ratio output. See PMCTest.h
134TempOut         DD    0                         ; optional arbitrary output. See PMCTest.h
135RatioOutTitle   DD    0                         ; optional column heading
136TempOutTitle    DD    0                         ; optional column heading
137
138
; ESP_SAVE exists only in single-thread builds: it is one shared variable, so
; it cannot hold the stack pointer of more than one thread.
139%if NUM_THREADS == 1
140ESP_SAVE         dd    0                         ; Save stack pointer if only one thread
141%endif
142
143
144;##############################################################################
145;#
146;#                 User data
147;#
148;##############################################################################
149ALIGN   CACHELINESIZE, DB 0
150
151; Put any data definitions your test code needs here
152
; 10000H (65536) zero bytes of scratch space for the code under test.
; The user initialisation in TestLoop points esi at UserData + thread*2020h
; and edi 120h bytes after that.
153UserData           times 10000H  DB 0
154
155
156;------------------------------------------------------------------------------
157;
158;                  Macro definitions
159;
160;------------------------------------------------------------------------------
161
; SERIALIZE
;   Executes CPUID with eax = 0 so that surrounding instructions cannot be
;   reordered across the counter reads.
;   Overwrites eax, ebx, ecx and edx.
%macro SERIALIZE 0
        xor     eax, eax
        cpuid
%endmacro

; CLEARXMMREG n
;   Zeroes the single register xmm<n>; n is the bare register number.
%macro CLEARXMMREG 1
        pxor    xmm%1, xmm%1
%endmacro

; CLEARALLXMMREG
;   Zeroes all vector registers: one VZEROALL when USEAVX is non-zero,
;   otherwise a PXOR for each of xmm0 .. xmm7.
%macro CLEARALLXMMREG 0
   %if USEAVX == 0
        CLEARXMMREG 0
        CLEARXMMREG 1
        CLEARXMMREG 2
        CLEARXMMREG 3
        CLEARXMMREG 4
        CLEARXMMREG 5
        CLEARXMMREG 6
        CLEARXMMREG 7
   %else
        vzeroall                        ; clears every ymm register
   %endif
%endmacro
182
183
184;------------------------------------------------------------------------------
185;
186;                  Test Loop
187;
188;------------------------------------------------------------------------------
189SECTION .text   align = 16
190
191; extern "C" int TestLoop (int thread) {
192; This function runs the code to test REPETITIONS times
193; and reads the counters before and after each run:
194
; Entry of TestLoop. The single argument (thread number) is read from the
; stack. ebx, esi, edi and ebp are saved here and restored before the return.
195TestLoop:
196        push    ebx
197        push    esi
198        push    edi
199        push    ebp
; 16 bytes of saved registers + 4 bytes of return address lie below the argument
200        mov     eax, [esp+16+4]            ; Thread number
201       
202; local variables:
203;   [esp]:   thread number
204;   [esp+4]: pointer to thread data block
205;   [esp+8]: loop counter
206
; The three pushes below create the locals listed above (12 bytes in total).
207        push    0
208        push    0
209        push    eax
210       
211%if NUM_THREADS == 1
212        mov     [ESP_SAVE], esp            ; Save stack pointer if only one thread
213%endif
214
215       
216;##############################################################################
217;#
218;#                 Warm up
219;#
220;##############################################################################
221; Get into max frequency state
; Runs WARMUPCOUNT/10 iterations of ten dependent IMULs. The product in eax is
; not used afterwards.
; NOTE: WARMUPCOUNT must be 0 or at least 10. For 1..9 the division gives
; ecx = 0, and the dec/jnz loop then wraps around and runs 2^32 iterations.
222
223%if WARMUPCOUNT
224
225        mov ecx, WARMUPCOUNT / 10
226        mov eax, 1
227        align 16
228Warmuploop:
229        %rep 10
230        imul eax, ecx
231        %endrep
232        dec ecx
233        jnz Warmuploop
234
235%endif
236
237
238;##############################################################################
239;#
240;#                 User Initializations
241;#
242;##############################################################################
243; You may add any initializations your test code needs here.
244; Registers esi, edi and ebp will be unchanged from here to the
245; Test code start. (This is 32-bit code, so there are no registers r8 - r12.)
246
247        finit                ; clear all FP registers
248       
249        CLEARALLXMMREG       ; clear all xmm or ymm registers
250
; eax = thread number * 2020h, so each thread gets its own region of UserData
251        imul eax, [esp], 2020h ; separate data for each thread
252        lea esi, [eax+UserData]
253        lea edi, [esi+120h]
254        xor ebp, ebp
255     
256       
257
258;##############################################################################
259;#
260;#                 End of user Initializations
261;#
262;##############################################################################
263
; ebx = ThreadData + thread number * THREADDSIZE. The pointer is kept in the
; local [esp+4] and reloaded from there whenever it is needed.
264        mov     ebx, ThreadData               ; address of first thread data block
265        imul    eax, [esp], THREADDSIZE       ; offset to thread data block
266;        DB      69H, 04H, 24H                 ; fix bug in ml.exe
267;        DD         THREADDSIZE
268        add     ebx, eax                      ; address of current thread data block
269        mov     [esp+4], ebx                  ; save on stack
270
271%if  SUBTRACT_OVERHEAD
272; First test loop. Measure empty code
; Same measurement sequence as the second test loop but with nothing between
; the two time stamp reads. The smallest clock and PMC counts seen are kept in
; CountOverhead[]. The loop test is at the bottom, so the body runs at least
; once. Only the low 32 bits (eax) of each RDTSC / RDPMC result are used.
273        mov     dword [esp+8], 0              ; Loop counter
274
275TEST_LOOP_1:
276
277        SERIALIZE
278
; SERIALIZE executes CPUID, which overwrites ebx, so the block pointer is
; reloaded from its stack slot each time.
279        mov     ebx, [esp+4]     
280        ; Read counters
; CountTemp[i+1] = PMC value before; ecx selects the counter for RDPMC
281%assign i  0
282%rep    NUM_COUNTERS
283        mov     ecx, [Counters + i*4]
284        rdpmc
285        mov     [ebx + i*4 + 4 + (CountTemp-ThreadData)], eax
286%assign i  i+1
287%endrep
288
289        SERIALIZE
290
291        mov     ebx, [esp+4]     
292        ; read time stamp counter
; CountTemp[0] = clock before
293        rdtsc
294        mov     [ebx + (CountTemp-ThreadData)], eax
295
296        SERIALIZE
297
298        ; Empty. Test code goes here in next loop
299
300        SERIALIZE
301
302        mov     ebx, [esp+4]     
303        ; read time stamp counter
; CountTemp[0] = clock before - clock after, i.e. minus the elapsed count
304        rdtsc
305        sub     [ebx + (CountTemp-ThreadData)], eax        ; CountTemp[0]
306
307        SERIALIZE
308
309        mov     ebx, [esp+4]     
310        ; Read counters
; CountTemp[i+1] = PMC before - PMC after, i.e. minus the elapsed count
311%assign i  0
312%rep    NUM_COUNTERS
313        mov     ecx, [Counters + i*4]
314        rdpmc
315        sub     [ebx + i*4 + 4 + (CountTemp-ThreadData)], eax  ; CountTemp[i+1]
316%assign i  i+1
317%endrep
318
319        SERIALIZE
320
321        mov     ebx, [esp+4]     
322        ; find minimum counts
; For the clock (i = 0) and each PMC: negate CountTemp[i] to get the elapsed
; count, then keep the unsigned minimum of that and CountOverhead[i] (cmovb).
; This relies on CountOverhead[] having been initialised to a value that is
; not below any real count (0FFFFFFFFh).
323%assign i  0
324%rep    NUM_COUNTERS + 1
325        mov     eax, [ebx+i*4+(CountTemp-ThreadData)]      ; -count
326        neg     eax
327        mov     edx, [ebx+i*4+(CountOverhead-ThreadData)]  ; previous count
328        cmp     eax, edx
329        cmovb   edx, eax
330        mov     [ebx+i*4+(CountOverhead-ThreadData)], edx  ; minimum count       
331%assign i  i+1
332%endrep
333       
334        ; end first test loop
335        inc     dword [esp+8]
336        cmp     dword [esp+8], OVERHEAD_REPETITIONS
337        jb      TEST_LOOP_1
338
339%endif   ; SUBTRACT_OVERHEAD
340
341       
342; Second test loop. Measure user code
; Runs the code under test REPETITIONS times. For repetition r it stores the
; elapsed clock count in ClockResults[r] and the elapsed count of PMC number i
; in PMCResults[i*REPETITIONS + r]. Only the low 32 bits (eax) of each
; RDTSC / RDPMC result are used.
343        mov     dword [esp+8], 0          ; Loop counter
344
345TEST_LOOP_2:
346
347        SERIALIZE
348     
; SERIALIZE executes CPUID, which overwrites ebx, so the block pointer is
; reloaded from its stack slot each time.
349        mov     ebx, [esp+4]     
350        ; Read counters
; CountTemp[i+1] = PMC value before; ecx selects the counter for RDPMC
351%assign i  0
352%rep    NUM_COUNTERS
353        mov     ecx, [Counters + i*4]
354        rdpmc
355        mov     [ebx + i*4 + 4 + (CountTemp-ThreadData)], eax
356%assign i  i+1
357%endrep
358
359        SERIALIZE
360
361        mov     ebx, [esp+4]     
362        ; read time stamp counter
; CountTemp[0] = clock before
363        rdtsc
364        mov     [ebx + (CountTemp-ThreadData)], eax
365
366        SERIALIZE
367
368;##############################################################################
369;#
370;#                 Test code start
371;#
372;##############################################################################
373
374; Put the assembly code to test here
; The measurement code that follows reloads ebx from [esp+4] and the loop
; counter from [esp+8], so the code under test must leave esp and these stack
; slots intact.
375
376; œœ
377
378
379%REP 100        ; example: 100 shift instructions
380
381        shr eax, 5
382
383%ENDREP
384
385
386;##############################################################################
387;#
388;#                 Test code end
389;#
390;##############################################################################
391
392        SERIALIZE
393
394        mov     ebx, [esp+4]     
395        ; read time stamp counter
; CountTemp[0] = clock before - clock after, i.e. minus the elapsed count
396        rdtsc
397        sub     [ebx + (CountTemp-ThreadData)], eax        ; CountTemp[0]
398
399        SERIALIZE
400
401        mov     ebx, [esp+4]     
402        ; Read counters
; CountTemp[i+1] = PMC before - PMC after, i.e. minus the elapsed count
403%assign i  0
404%rep    NUM_COUNTERS
405        mov     ecx, [Counters + i*4]
406        rdpmc
407        sub     [ebx + i*4 + 4 + (CountTemp-ThreadData)], eax  ; CountTemp[i+1]
408%assign i  i+1
409%endrep       
410
411        SERIALIZE
412
; ebx = thread data block, ecx = index of the current repetition
413        mov     ebx, [esp+4]
414        mov     ecx, [esp+8]
415        ; subtract counts before from counts after
; CountTemp[] holds before - after, so negating it gives after - before
416        mov     eax, [ebx + (CountTemp-ThreadData)]            ; -count
417        neg     eax
418%if     SUBTRACT_OVERHEAD
419        sub     eax, [ebx+(CountOverhead-ThreadData)]   ; overhead clock count       
420%endif  ; SUBTRACT_OVERHEAD       
421        mov     [ebx+ecx*4+(ClockResults-ThreadData)], eax      ; save clock count
422       
; Same for every PMC; the results of counter i occupy REPETITIONS consecutive
; dwords starting at PMCResults + i*4*REPETITIONS
423%assign i  0
424%rep    NUM_COUNTERS
425        mov     eax, [ebx + i*4 + 4 + (CountTemp-ThreadData)]
426        neg     eax
427%if     SUBTRACT_OVERHEAD
428        sub     eax, [ebx+i*4+4+(CountOverhead-ThreadData)]   ; overhead pmc count       
429%endif  ; SUBTRACT_OVERHEAD       
430        mov     [ebx+ecx*4+i*4*REPETITIONS+(PMCResults-ThreadData)], eax      ; save count       
431%assign i  i+1
432%endrep       
433       
434        ; end second test loop
435        inc     dword [esp+8]
436        cmp     dword [esp+8], REPETITIONS
437        jb      TEST_LOOP_2
438
439        ; clean up
; Put the FPU, the direction flag, the vector registers and ES back into a
; known state, in case the code under test changed them.
440        finit
441        cld
442%if USEAVX
443        VZEROALL                       ; clear all ymm registers
444%endif
445
; es = ds
446        push    ds
447        pop     es
448%if NUM_THREADS == 1
449        mov     esp, [ESP_SAVE]        ; Restore stack pointer if only one thread
450%endif
451
452        ; return REPETITIONS;
453        mov     eax, REPETITIONS
; Drop the three dword locals, then restore the saved registers in the reverse
; of the order in which they were pushed at entry
454        add     esp, 12
455        pop     ebp
456        pop     edi
457        pop     esi
458        pop     ebx
459        ret
460       
461; End of TestLoop
Note: See TracBrowser for help on using the repository browser.