Causing non-atomics to tear

2019-05-23 03:39发布

问题:

Hi, I would like an int and a float example that causes tearing when writing non-atomic values. I can't seem to reproduce this. It seems like something that is extremely rare, or something I'm doing wrong.

Here is my test code which never prints. Is there anything wrong with it?

#include <windows.h>
#include <tchar.h>
#include <strsafe.h>

// Number of worker threads created by _tmain; also the size of its
// thread-handle and thread-id arrays.
#define MAX_THREADS 64
// Capacity, in TCHARs, of the message buffer each worker thread formats its
// console output into.
#define BUF_SIZE 255

// Thread entry point handed to CreateThread; defined further down in the file.
DWORD WINAPI MyThreadFunction( LPVOID lpParam );
// Shows a message box describing the last error for the named function;
// defined further down in the file.
void ErrorHandler(LPTSTR lpszFunction);

// Example payload type for worker threads: two plain ints.
// A thread argument travels as a single void pointer (LPVOID), so any type
// reachable through one pointer could be used in its place.
// MYDATA names the struct itself and PMYDATA a pointer to it.
struct MyData {
    int val1;
    int val2;
};
using MYDATA  = MyData;
using PMYDATA = MyData*;


int _tmain()
{
    DWORD   dwThreadIdArray[MAX_THREADS];
    HANDLE  hThreadArray[MAX_THREADS]; 

    // Create MAX_THREADS worker threads.

    for( int i=0; i<MAX_THREADS; i++ )
    {
        // Allocate memory for thread data.
        // Create the thread to begin execution on its own.

        hThreadArray[i] = CreateThread( 
            NULL,                   // default security attributes
            0,                      // use default stack size  
            MyThreadFunction,       // thread function name
            NULL,                   // argument to thread function 
            0,                      // use default creation flags 
            &dwThreadIdArray[i]);   // returns the thread identifier 


        // Check the return value for success.
        // If CreateThread fails, terminate execution. 
        // This will automatically clean up threads and memory. 

        if (hThreadArray[i] == NULL) 
        {
            ErrorHandler(TEXT("CreateThread"));
            ExitProcess(3);
        }
    } // End of main thread creation loop.

    // Wait until all threads have terminated.

    WaitForMultipleObjects(MAX_THREADS, hThreadArray, TRUE, INFINITE);

    // Close all thread handles and free memory allocations.

    for(int i=0; i<MAX_THREADS; i++)
    {
        CloseHandle(hThreadArray[i]);
    }

    return 0;
}

// Byte-packed test structure. With 1-byte packing no padding is inserted, so
// the three leading chars place `test` at byte offset 3, i.e. the int is
// deliberately misaligned.
#pragma pack(push, 1)
struct Test
{
    char x1;
    char x2;   // was a second `x1`, which is a duplicate-member compile error
    char x3;
    int test;  // the shared value the worker threads write and read
    char x4;
    char x5;
};

// Single shared instance used by all worker threads (value-initialized, so
// every member starts at zero).
Test* t = new Test(); //This is test code don't care about allocation or that it is a global.
#pragma pack(pop)

// Worker thread body.
// Prints a "Starting thread" line to the console, then repeatedly stores three
// known values into the shared t->test and reads the field back, printing a
// message whenever the value read is none of the three.
//   lpParam - unused by this function.
//   returns - 1 if the standard output handle is invalid, otherwise 0.
DWORD WINAPI MyThreadFunction( LPVOID lpParam ) 
{ 
    HANDLE hStdout;

    TCHAR msgBuf[BUF_SIZE];
    size_t cchStringSize;
    DWORD dwChars;

    // Make sure there is a console to receive output results. 

    hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
    if( hStdout == INVALID_HANDLE_VALUE )
        return 1;

    // NOTE(review): `thread` is a plain static int shared by every thread
    // running this function and is incremented without synchronization, so
    // the printed thread numbers may repeat or skip.
    static int thread = 0;
    StringCchPrintf(msgBuf, BUF_SIZE, TEXT("Starting thread, %d\n"), ++thread); 
    StringCchLength(msgBuf, BUF_SIZE, &cchStringSize);
    WriteConsole(hStdout, msgBuf, (DWORD)cchStringSize, &dwChars, NULL);

    t->test = 1;

    for (int i=0; i<1000000000;++i)
    {
        // NOTE(review): t->test is neither volatile nor atomic, so the
        // compiler is presumably free to drop the first two stores and to
        // satisfy the read below from the value just written; in that case
        // the check can never fire regardless of what other threads do.
        // Also, `test` sits at byte offset 3 of the packed struct, so it is
        // misaligned, but whether it straddles a cache line depends on where
        // the allocation lands - confirm before relying on this test.
        t->test = 1;
        t->test = 10000;
        t->test = 10000000;

        int result = t->test;

        // Any value other than the three written above is reported as a tear.
        if(result != 1 && result != 10000 && result != 10000000)
        {
            StringCchPrintf(msgBuf, BUF_SIZE, TEXT("Tearing occured = %d\n"), result); 
            StringCchLength(msgBuf, BUF_SIZE, &cchStringSize);
            WriteConsole(hStdout, msgBuf, (DWORD)cchStringSize, &dwChars, NULL);
        }
    }


    return 0; 
} 



// Reports the calling thread's last Win32 error in a message box.
//   lpszFunction - name of the API that failed; used as the message prefix.
// The system message text for the error code is looked up with FormatMessage
// and combined into "<function> failed with error <code>: <text>".
// If the system text cannot be retrieved the text part is left empty; if the
// display buffer cannot be allocated nothing is shown.
void ErrorHandler(LPTSTR lpszFunction) 
{ 
    // Read the error code first, before any other API call can overwrite it.
    const DWORD dw = GetLastError(); 

    // Start from NULL so a failed FormatMessage never leaves a garbage pointer
    // to be measured, printed, or freed.
    LPVOID lpMsgBuf = NULL;

    const DWORD cchSystemMsg = FormatMessage(
        FORMAT_MESSAGE_ALLOCATE_BUFFER | 
        FORMAT_MESSAGE_FROM_SYSTEM |
        FORMAT_MESSAGE_IGNORE_INSERTS,
        NULL,
        dw,
        MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
        reinterpret_cast<LPTSTR>(&lpMsgBuf),  // receives a LocalAlloc'ed buffer
        0, NULL );

    if (cchSystemMsg == 0)
    {
        lpMsgBuf = NULL;  // lookup failed: no buffer was handed to us
    }

    LPCTSTR pszSystemMsg = (lpMsgBuf != NULL) ? static_cast<LPCTSTR>(lpMsgBuf) : TEXT("");

    // Display the error message.
    // 40 extra characters cover the fixed text and the decimal error code.
    const size_t cchDisplay =
        static_cast<size_t>(lstrlen(pszSystemMsg)) +
        static_cast<size_t>(lstrlen(lpszFunction)) + 40;

    LPVOID lpDisplayBuf = LocalAlloc(LMEM_ZEROINIT, cchDisplay * sizeof(TCHAR));
    if (lpDisplayBuf != NULL)
    {
        // DWORD is an unsigned long, hence %lu rather than %d.
        StringCchPrintf(static_cast<LPTSTR>(lpDisplayBuf), 
            cchDisplay,
            TEXT("%s failed with error %lu: %s"), 
            lpszFunction, dw, pszSystemMsg); 
        MessageBox(NULL, static_cast<LPCTSTR>(lpDisplayBuf), TEXT("Error"), MB_OK); 
    }

    // Free error-handling buffer allocations.
    if (lpMsgBuf != NULL)
    {
        LocalFree(lpMsgBuf);
    }
    if (lpDisplayBuf != NULL)
    {
        LocalFree(lpDisplayBuf);
    }
}

回答1:

I can trigger torn reads / writes with this test code which forces the contended uint32_t to cross a cache line boundary when compiled with Visual Studio 2013 (only seems to happen in Release builds):

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

using namespace std;

// Set to true by the first thread that observes a torn value; every writer
// thread polls it and stops once it is set.
// Brace-initialized because std::atomic is not copyable, so the
// `= false` form is rejected by pre-C++17 compilers.
atomic<bool> gDone{false};

// Candidate values, one per writer thread (indexed by thread id). Any value
// read back that is not in this list is reported as a tear.
vector<uint32_t> vals = {0x11111111, 0x22222222, 0x33333333, 0x44444444, };

// Serializes console output between threads.
mutex ioMutex;

// Writer/checker loop run by each thread.
// Until gDone is set, stores this thread's value (vals[tid]) through pVal,
// reads the location back, and checks that what was read is one of the
// values in `vals`. A value outside that list is reported on stdout (under
// ioMutex) and gDone is set, which ends the loop in every thread.
//   pVal - location shared by all threads; volatile so that each store and
//          load in the loop is actually performed.
//   tid  - index into `vals` selecting the value this thread writes; not
//          range-checked here.
void writeVal(volatile uint32_t* pVal, int tid) {
    while (!gDone) {
        *pVal = vals[tid];
        const auto currentVal = *pVal;
        // Another thread may have written between the store and the load, so
        // any entry of `vals` is acceptable; anything else is a mix of two.
        auto findIt = find(begin(vals), end(vals), currentVal);
        if (findIt == end(vals)) {
            unique_lock<mutex> ul(ioMutex);
            // setbase(16) stays in effect, so currentVal is printed in hex too.
            cout << "Detected torn read/write! pVal = 0x" << setbase(16) << setw(8) << setfill('0')
                 << reinterpret_cast<uintptr_t>(pVal) << " currentVal = 0x" << currentVal << endl;
            gDone = true;
        }
    }
}

// Entry point of the tearing demo.
// Picks an address inside a heap buffer that lies 2 bytes before a 64-byte
// boundary, so that a 4-byte value stored there spans two 64-byte lines, then
// starts one writeVal thread per entry of `vals` on that address and waits
// for all of them to finish.
int main() {
    constexpr uintptr_t cacheLineSize = 64;  // line size assumed by this demo

    vector<char> memVec(16 * 1024);
    const auto bufferStart = reinterpret_cast<uintptr_t>(memVec.data());

    // First line boundary that is at least 2 bytes past the start of the
    // buffer, so that `boundary - 2` can never point before memVec[0]
    // whatever the buffer's own alignment is. The mask is built from an
    // unsigned constant of pointer width, so no sign extension is involved.
    const uintptr_t boundary = (bufferStart + 2 + (cacheLineSize - 1)) & ~(cacheLineSize - 1);

    // Deliberately misaligned: 2 bytes on one side of the boundary, 2 on the other.
    uint32_t* tearableUint32 = reinterpret_cast<uint32_t*>(boundary - 2);

    vector<thread> threads;
    threads.reserve(vals.size());
    for (size_t i = 0; i != vals.size(); ++i) {
        const int tid = static_cast<int>(i);  // writeVal takes the index as int
        threads.emplace_back([=] { writeVal(tearableUint32, tid); });
    }
    for (auto& t : threads) {
        t.join();
    }
}

Output:

Detected torn read/write! pVal = 0x004bc43e currentVal = 0x11112222


回答2:

FWIW, this is just additional info for the previous answer; someone with higher Stack Overflow privileges than me may just move it to the comments for the previous question.

I just checked the addresses that actually give tearing, and as expected, the address is 62 bytes into the cache line, so the 32-bit value gets written to the last two bytes of one cache line and to the first two bytes of another. See gdb output below.

alapaa@hilbert:~/src/stackoverflow$ g++ -g -std=c++0x tear.cpp -pthread -o tear  

alapaa@hilbert:~/src/stackoverflow$ ./tear  

Detected torn read/write! pVal = 0x00c0503e currentVal = 0x33331111  
Detected torn read/write! pVal = 0x00c0503e currentVal = 0x44441111  
alapaa@hilbert:~/src/stackoverflow$ gdb  
GNU gdb (Ubuntu 7.7-0ubuntu3.1) 7.7  
Copyright (C) 2014 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".

(gdb) p 0x00c0503e % 64  
$1 = 62