Windows multicore threads enter their function but

2019-08-21 15:41发布

问题:

For several days I have had trouble with creating four threads (for four cores) on a Windows 7 Ivy Bridge system. I have created a simple test function in NASM, as minimal as it can be, to demonstrate.

The program increments a counter from 0 to 1 billion and returns the result. For multicore processing, I divided it so core 1 counts from 0 to 2.5 million, core 2 counts from 2.5 million to 5 million, etc.

For testing purposes. the program returns a 12-element array with three values from each core. Elements 1-4 are the thread handle returned by CreateThread; elements 5-8 are the return value from the call GetCurrentThreadId after the thread enters the test function (Test_fn); and elements 9-12 are the results of the calculations performed by each thread (counting from startbyte to endbyte).

The array returned (TestInfo) shows that CreateThread always succeeds, as the first four elements always contain the thread handle returned by CreateThread (in my tests), and elements 5-8 always return a value from GetCurrentThreadId on entry to the function Test_fn. The problem is that elements 9-12 (the calculation results from each core) do not always return a value from the math calculation in the body of Test_fn, and the distribution is random (sometimes cores 1 and 4 will succeed, sometimes cores 2 and 3 only, etc). That means threads do not always successfully run their assigned calculations even though they are created and call Test_fn.

This is a dll called from Python, but it could be called from C or C++. It takes no arguments, and it returns a pointer to the 12-element test array TestInfo (described above). The entry point is Main_Entry_fn, which calls Init_Cores_fn. Threads are pointed to Test_fn.

So my question is why do threads not always reliably return a value from Test_fn even though the threads clearly call Test_fn.

; Header Section
[BITS 64]
[default rel]

extern malloc, calloc, realloc, free
global Main_Entry_fn
export Main_Entry_fn
extern CreateThread, CloseHandle, ExitThread
extern WaitForMultipleObjects, GetCurrentThreadId

section .data align=16
const_1000000000: dq 1000000000
ThreadID:  dq 0
TestInfo: times 12 dq 0
ThreadInfo: times 3 dq 0
ThreadInfo2: times 3 dq 0
ThreadInfo3: times 3 dq 0
ThreadInfo4: times 3 dq 0
ThreadHandles: times 4 dq 0
Division_Size: dq 0
Start_Byte: dq 0
End_Byte: dq 0
Return_Data_Array: times 4 dq 0
Core_Number: dq 0

section .text

; ______________________________________

Init_Cores_fn:

; Calculate the data divisions
mov rax,[const_1000000000]
mov rbx,4 ;cores
xor rdx,rdx
div rbx
mov [End_Byte],rax
mov [Division_Size],rax
mov rax,0
mov [Start_Byte],rax

; Populate the ThreadInfo arrays to pass for each core
; ThreadInfo:  (1) startbyte; (2) endbyte; (3) Core_Number (0, 8, 16, 24)
mov rdi,ThreadInfo
mov rax,[Start_Byte]
mov [rdi],rax
mov rax,[End_Byte]
mov [rdi+8],rax
mov rax,[Core_Number]
mov [rdi+16],rax

call DupThreadInfo ; Create ThreadInfo arrays for cores 2-4

mov rbp,rsp ; preserve caller's stack frame
sub rsp,56 ; Shadow space (was 32)

; _____

label_0:

mov rax,[Core_Number]
cmp rax,0
jne sb2
mov rdi,ThreadInfo
jmp sb5
sb2:cmp rax,8
jne sb3
mov rdi,ThreadInfo2
jmp sb5
sb3:cmp rax,16
jne sb4
mov rdi,ThreadInfo3
jmp sb5
sb4:cmp rax,24
jne sb5
mov rdi,ThreadInfo4
sb5:

; _____
; Create Threads

mov rcx,0               ; lpThreadAttributes (Security Attributes)
mov rdx,0               ; dwStackSize
mov r8,Test_fn          ; lpStartAddress (function pointer)
mov r9,rdi              ; lpParameter (array of data passed to each core)
mov rax,0
mov [rsp+32],rax            ; use default creation flags
mov rdi,ThreadID
mov [rsp+40],rdi            ; ThreadID

call CreateThread

; Move the handle into ThreadHandles array (returned in rax)
mov rdi,ThreadHandles
mov rcx,[Core_Number]
mov [rdi+rcx],rax
mov rdi,TestInfo
mov [rdi+rcx],rax

mov rax,[Core_Number]
add rax,8
mov [Core_Number],rax
mov rbx,32 ; Four cores
cmp rax,rbx
jl label_0

; _____
; Wait

mov rcx,4 ;rax          ; number of handles
mov rdx,ThreadHandles       ; pointer to handles array
mov r8,0                ; wait for all threads to complete
mov r9,5000         ; milliseconds to wait

call WaitForMultipleObjects

; _____

mov rsp,rbp
jmp label_900

; ______________________________________

Test_fn:

;______

; GetCurrentThreadId
mov rdi,rcx
push rcx
call GetCurrentThreadId
mov rcx,[rdi+16] ; startbyte
mov rdi,TestInfo
mov [rdi+rcx+32],rax
pop rcx

;______

mov rdi,rcx

mov r14,[rdi] ; Start_Byte
mov r15,[rdi+8] ; End_Byte
mov r13,[rdi+16] ; Core_Number

;______

label_401:
cmp r14,r15
jge label_899

; n += 1
add r14,1

jmp label_401

;______

label_899:

mov rdi,Return_Data_Array
mov [rdi+r13],r14

mov rdi,TestInfo
mov [rdi+r13+64],r14

mov rbp,ThreadHandles
mov rax,[rbp+r13]
;mov [rdi+rbx+64],rax

call ExitThread

ret

; __________

label_900:

mov rdi,ThreadHandles
mov r8,0
label_900_01:
mov rcx,[rdi+r8]
call CloseHandle
add r8,8
cmp r8,32
jl label_900_01

mov rdi,TestInfo
mov rax,rdi

ret

; __________
; Main Entry

Main_Entry_fn:
push rdi
push rbp
call Init_Cores_fn
pop rbp
pop rdi
ret

DupThreadInfo:
mov rdi,ThreadInfo2
mov rax,8
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
add rax,[Division_Size]
mov [rdi],rax
mov rax,[End_Byte]
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax

mov rdi,ThreadInfo3
mov rax,16
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
mov [rdi],rax
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax

mov rdi,ThreadInfo4
mov rax,24
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
mov [rdi],rax
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
ret

The cores use the 3-element arrays ThreadInfo, ThreadInfo2, ThreadInfo3 and ThreadInfo4 for their data. For each thread, they contain start number, end number, and the core number multiplied by 8:

0 250000000 0

250000000 500000000 8

500000000 750000000 16

750000000 1000000000 24

Here are the results of four separate tests:

1548 1716 1688 1768 / 6460 6464 6468 6472 / 250000000 0 0 1000000000

1744 860 1724 1668 / 6780 6784 6788 6792 / 0 0 0 1000000000

1632 1588 1488 872 / 7024 7028 7032 7036 / 0 500000000 0 0

1740 1732 1684 1536 / 6876 6884 6888 6880 / 250000000 0 750000000 0

The first four numbers are the thread handles for each core; the second four numbers are the return value from GetCurrentThreadId on entry to Test_fn, and the final set of four numbers are the results of the simple calculations performed in Test_fn; they show that cores return the correct data in some but not all cases.

Thanks for any ideas.