Disassembly of dispatch_async_f after the fix:
0000000000006d50 <dispatch_async_f>:
    6d50: 55                      push   %rbp
    6d51: 48 89 e5                mov    %rsp,%rbp
    6d54: 48 89 5d d8             mov    %rbx,0xffffffffffffffd8(%rbp)
    6d58: 4c 89 65 e0             mov    %r12,0xffffffffffffffe0(%rbp)
    6d5c: 48 89 fb                mov    %rdi,%rbx
    6d5f: 4c 89 6d e8             mov    %r13,0xffffffffffffffe8(%rbp)
    6d63: 4c 89 75 f0             mov    %r14,0xfffffffffffffff0(%rbp)
    6d67: 49 89 f4                mov    %rsi,%r12
    6d6a: 4c 89 7d f8             mov    %r15,0xfffffffffffffff8(%rbp)
    6d6e: 48 83 ec 30             sub    $0x30,%rsp
    6d72: 4c 8b 2d cf 3b 01 00    mov    80847(%rip),%r13        # 1a948 <_GLOBAL_OFFSET_TABLE_+0x168>
    6d79: 49 89 d6                mov    %rdx,%r14
    6d7c: 41 8b 7d 00             mov    0x0(%r13),%edi
    6d80: e8 a3 e2 ff ff          callq  5028 <pthread_getspecific@plt>
    6d85: 48 85 c0                test   %rax,%rax
    6d88: 49 89 c7                mov    %rax,%r15
    6d8b: 74 4a                   je     6dd7 <dispatch_async_f+0x87>
    6d8d: 48 8b 70 08             mov    0x8(%rax),%rsi
    6d91: 41 8b 7d 00             mov    0x0(%r13),%edi
    6d95: e8 b6 f0 ff ff          callq  5e50 <_dispatch_thread_setspecific>
    6d9a: 49 c7 07 01 00 00 00    movq   $0x1,(%r15)
    6da1: 4d 89 77 10             mov    %r14,0x10(%r15)
    6da5: 4d 89 67 18             mov    %r12,0x18(%r15)
    6da9: 49 c7 47 08 00 00 00    movq   $0x0,0x8(%r15)          // <-------------------------------- SBN: (1) tail->do_next = NULL
    6db0: 00
    6db1: 4c 89 f8                mov    %r15,%rax
    6db4: 48 87 43 40             xchg   %rax,0x40(%rbx)         // <-------------------------------- SBN: (2) dispatch_atomic_xchg(&dq->dq_items_tail, tail)
    6db8: 48 85 c0                test   %rax,%rax
    6dbb: 74 3d                   je     6dfa <dispatch_async_f+0xaa>
    6dbd: 4c 89 78 08             mov    %r15,0x8(%rax)
    6dc1: 48 8b 5d d8             mov    0xffffffffffffffd8(%rbp),%rbx
    6dc5: 4c 8b 65 e0             mov    0xffffffffffffffe0(%rbp),%r12
    6dc9: 4c 8b 6d e8             mov    0xffffffffffffffe8(%rbp),%r13
    6dcd: 4c 8b 75 f0             mov    0xfffffffffffffff0(%rbp),%r14
    6dd1: 4c 8b 7d f8             mov    0xfffffffffffffff8(%rbp),%r15
    6dd5: c9                      leaveq
    6dd6: c3                      retq
    6dd7: 4c 89 f2                mov    %r14,%rdx
    6dda: 4c 89 e6                mov    %r12,%rsi
    6ddd: 48 89 df                mov    %rbx,%rdi
    6de0: 4c 8b 65 e0             mov    0xffffffffffffffe0(%rbp),%r12
    6de4: 48 8b 5d d8             mov    0xffffffffffffffd8(%rbp),%rbx
    6de8: 4c 8b 6d e8             mov    0xffffffffffffffe8(%rbp),%r13
    6dec: 4c 8b 75 f0             mov    0xfffffffffffffff0(%rbp),%r14
    6df0: 4c 8b 7d f8             mov    0xfffffffffffffff8(%rbp),%r15
    6df4: c9                      leaveq
    6df5: e9 e6 fe ff ff          jmpq   6ce0 <_dispatch_async_f_slow>
    6dfa: 4c 89 fe                mov    %r15,%rsi
    6dfd: 48 89 df                mov    %rbx,%rdi
    6e00: 4c 8b 65 e0             mov    0xffffffffffffffe0(%rbp),%r12
    6e04: 48 8b 5d d8             mov    0xffffffffffffffd8(%rbp),%rbx
    6e08: 4c 8b 6d e8             mov    0xffffffffffffffe8(%rbp),%r13
    6e0c: 4c 8b 75 f0             mov    0xfffffffffffffff0(%rbp),%r14
    6e10: 4c 8b 7d f8             mov    0xfffffffffffffff8(%rbp),%r15
    6e14: c9                      leaveq
    6e15: e9 4e e2 ff ff          jmpq   5068 <_dispatch_queue_push_list_slow@plt>
    6e1a: 66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
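As one can see, all stores are now performed in the correct order: the store of NULL to tail->do_next (1) is issued before the atomic exchange on dq->dq_items_tail (2).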