内存屏障(Memory Barriers)

起因是最近在看levelDB源码,其中port里的atomic_pointer.h文件用到了内存屏障。。

于是来学习一下。。

粗略地说下我自己的理解。

代码的书写顺序并不和实际执行的顺序完全对应:出于对效率的追求,CPU 和编译器会对部分指令的顺序进行重排,以期得到最大的执行效率。

比如下面这段代码:

// example 2
    // void *ptr, *v, *_store;
    v = ptr;
    _store = v;
    somefunc();
    v = _store;

v的值是没有改变的,那么编译器可能会认为_store = v; v = _store; 是多余的,就直接把这一段给“优化”掉了。这段代码在单线程中确实是多余的,但是在多线程环境下,可能在somefunc()被调用的时候,另一个线程把v的值给改变了,而这种情况是编译器无法发现的。因此,为了避免这种情况。。。内存屏障登场!

摘自维基百科:

**内存屏障**,也称**内存栅栏**,**内存栅障**,**屏障指令**等,是一类[同步屏障](https://zh.wikipedia.org/wiki/同步屏障)指令,是CPU或编译器在对内存随机访问的操作中的一个同步点,使得此点之前的所有读写操作都执行后才可以开始执行此点之后的操作。

大多数现代计算机为了提高性能而采取乱序执行,这使得内存屏障成为必须。

语义上,内存屏障之前的所有写操作都要写入内存;内存屏障之后的读操作都可以获得同步屏障之前的写操作的结果。因此,对于敏感的程序块,写操作之后、读操作之前可以插入内存屏障。

在多线程环境里需要使用某种技术来使程序结果尽快可见。请先假定一个事实:一旦内存数据被推送到缓存,就会有消息协议来确保所有的缓存会对所有的共享数据同步并保持一致。这个使内存数据对CPU核可见的技术被称为**内存屏障或内存栅栏**。

再看一个例子

// get start time
for (int i = 0; i != 100000; i++) {
    MemoryBarrier();
}
// get end time

这段代码,是想知道for循环空转100000次的耗时,这里就需要加入一个MemoryBarrier,如果不加,那么编译器可能就会直接把这个无意义的for循环直接优化掉了。

除了编译器,cpu由于指令流水线或者超流水线等技术,也可能导致出现乱序执行的情况。

内存屏障提供了两个功能。首先,它们通过确保从另一个CPU来看屏障的两边的所有指令都是正确的程序顺序,而保持程序顺序的外部可见性;其次它们可以实现内存数据可见性,确保内存数据会同步到CPU缓存子系统。

不过内存屏障由于阻碍了cpu和编译器的部分优化。。。因此对性能的影响是不可忽略的。

为了达到最佳性能,最好是把要解决的问题模块化,这样处理器可以按单元执行任务,然后在任务单元的边界放上所有需要的内存屏障。采用这个方法可以让处理器不受限的执行一个任务单元。合理的内存屏障组合还有一个好处是:缓冲区在第一次被刷后开销会减少,因为再填充该缓冲区不需要额外工作了。

内存屏障的实现不同平台差别很大。。。因此我们可以看到atomic_pointer.h文件中有一堆和平台相关的条件编译…

// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// AtomicPointer provides storage for a lock-free pointer.
// Platform-dependent implementation of AtomicPointer:
// - If the platform provides a cheap barrier, we use it with raw pointers
// - If <atomic> is present (on newer versions of gcc, it is), we use
//   a <atomic>-based AtomicPointer.  However we prefer the memory
//   barrier based version, because at least on a gcc 4.4 32-bit build
//   on linux, we have encountered a buggy <atomic> implementation.
//   Also, some <atomic> implementations are much slower than a memory-barrier
//   based implementation (~16ns for <atomic> based acquire-load vs. ~1ns for
//   a barrier based acquire-load).
// This code is based on atomicops-internals-* in Google's perftools:
// http://code.google.com/p/google-perftools/source/browse/#svntrunksrcbase
#ifndef PORT_ATOMIC_POINTER_H_
#define PORT_ATOMIC_POINTER_H_
 1#include <stdint.h>
 2#ifdef LEVELDB_ATOMIC_PRESENT
 3#include <atomic>
 4#endif
 5#ifdef OS_WIN
 6#include <windows.h>
 7#endif
 8#ifdef OS_MACOSX
 9#include <libkern/OSAtomic.h>
10#endif
 1#if defined(_M_X64) || defined(__x86_64__)
 2#define ARCH_CPU_X86_FAMILY 1
 3#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
 4#define ARCH_CPU_X86_FAMILY 1
 5#elif defined(__ARMEL__)
 6#define ARCH_CPU_ARM_FAMILY 1
 7#elif defined(__aarch64__)
 8#define ARCH_CPU_ARM64_FAMILY 1
 9#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
10#define ARCH_CPU_PPC_FAMILY 1
11#elif defined(__mips__)
12#define ARCH_CPU_MIPS_FAMILY 1
13#endif
namespace leveldb {
namespace port {
// Define MemoryBarrier() if available.
//
// Every branch below that supplies a MemoryBarrier() also defines
// LEVELDB_HAVE_MEMORY_BARRIER. If no branch matches, neither is defined.

// Windows on x86
#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY)
// windows.h already provides a MemoryBarrier(void) macro
// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
#define LEVELDB_HAVE_MEMORY_BARRIER

// Mac OS
#elif defined(OS_MACOSX)
inline void MemoryBarrier() {
  OSMemoryBarrier();
}
#define LEVELDB_HAVE_MEMORY_BARRIER

// Gcc on x86
#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
inline void MemoryBarrier() {
  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
  // The asm template is empty, so no instruction is emitted; the "memory"
  // clobber only stops the compiler from moving memory accesses across it.
  __asm__ __volatile__("" : : : "memory");
}
#define LEVELDB_HAVE_MEMORY_BARRIER

// Sun Studio
#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC)
inline void MemoryBarrier() {
  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
  asm volatile("" : : : "memory");
}
#define LEVELDB_HAVE_MEMORY_BARRIER

// ARM Linux
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__)
typedef void (*LinuxKernelMemoryBarrierFunc)(void);
// The Linux ARM kernel provides a highly optimized device-specific memory
// barrier function at a fixed memory address that is mapped in every
// user-level process.
//
// This beats using CPU-specific instructions which are, on single-core
// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more
// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking
// shows that the extra function call cost is completely negligible on
// multi-core devices.
//
inline void MemoryBarrier() {
  // Call the kernel-provided helper through its fixed address.
  (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)();
}
#define LEVELDB_HAVE_MEMORY_BARRIER

// ARM64
#elif defined(ARCH_CPU_ARM64_FAMILY)
inline void MemoryBarrier() {
  asm volatile("dmb sy" : : : "memory");
}
#define LEVELDB_HAVE_MEMORY_BARRIER

// PPC
#elif defined(ARCH_CPU_PPC_FAMILY) && defined(__GNUC__)
inline void MemoryBarrier() {
  // TODO for some powerpc expert: is there a cheaper suitable variant?
  // Perhaps by having separate barriers for acquire and release ops.
  asm volatile("sync" : : : "memory");
}
#define LEVELDB_HAVE_MEMORY_BARRIER

// MIPS
#elif defined(ARCH_CPU_MIPS_FAMILY) && defined(__GNUC__)
inline void MemoryBarrier() {
  __asm__ __volatile__("sync" : : : "memory");
}
#define LEVELDB_HAVE_MEMORY_BARRIER

#endif
 1// AtomicPointer built using platform-specific MemoryBarrier()
 2#if defined(LEVELDB_HAVE_MEMORY_BARRIER)
 3class AtomicPointer {
 4 private:
 5  void* rep_;
 6 public:
 7  AtomicPointer() { }
 8  explicit AtomicPointer(void* p) : rep_(p) {}
 9  inline void* NoBarrier_Load() const { return rep_; }
10  inline void NoBarrier_Store(void* v) { rep_ = v; }
11  inline void* Acquire_Load() const {
12    void* result = rep_;
13    MemoryBarrier();
14    return result;
15  }
16  inline void Release_Store(void* v) {
17    MemoryBarrier();
18    rep_ = v;
19  }
20};
 1// AtomicPointer based on <cstdatomic>
 2#elif defined(LEVELDB_ATOMIC_PRESENT)
 3class AtomicPointer {
 4 private:
 5  std::atomic<void*> rep_;
 6 public:
 7  AtomicPointer() { }
 8  explicit AtomicPointer(void* v) : rep_(v) { }
 9  inline void* Acquire_Load() const {
10    return rep_.load(std::memory_order_acquire);
11  }
12  inline void Release_Store(void* v) {
13    rep_.store(v, std::memory_order_release);
14  }
15  inline void* NoBarrier_Load() const {
16    return rep_.load(std::memory_order_relaxed);
17  }
18  inline void NoBarrier_Store(void* v) {
19    rep_.store(v, std::memory_order_relaxed);
20  }
21};
 1// Atomic pointer based on sparc memory barriers
 2#elif defined(__sparcv9) && defined(__GNUC__)
 3class AtomicPointer {
 4 private:
 5  void* rep_;
 6 public:
 7  AtomicPointer() { }
 8  explicit AtomicPointer(void* v) : rep_(v) { }
 9  inline void* Acquire_Load() const {
10    void* val;
11    __asm__ __volatile__ (
12        "ldx [%[rep_]], %[val] \n\t"
13         "membar #LoadLoad|#LoadStore \n\t"
14        : [val] "=r" (val)
15        : [rep_] "r" (&rep_)
16        : "memory");
17    return val;
18  }
19  inline void Release_Store(void* v) {
20    __asm__ __volatile__ (
21        "membar #LoadStore|#StoreStore \n\t"
22        "stx %[v], [%[rep_]] \n\t"
23        :
24        : [rep_] "r" (&rep_), [v] "r" (v)
25        : "memory");
26  }
27  inline void* NoBarrier_Load() const { return rep_; }
28  inline void NoBarrier_Store(void* v) { rep_ = v; }
29};
 1// Atomic pointer based on ia64 acq/rel
 2#elif defined(__ia64) && defined(__GNUC__)
 3class AtomicPointer {
 4 private:
 5  void* rep_;
 6 public:
 7  AtomicPointer() { }
 8  explicit AtomicPointer(void* v) : rep_(v) { }
 9  inline void* Acquire_Load() const {
10    void* val    ;
11    __asm__ __volatile__ (
12        "ld8.acq %[val] = [%[rep_]] \n\t"
13        : [val] "=r" (val)
14        : [rep_] "r" (&rep_)
15        : "memory"
16        );
17    return val;
18  }
19  inline void Release_Store(void* v) {
20    __asm__ __volatile__ (
21        "st8.rel [%[rep_]] = %[v]  \n\t"
22        :
23        : [rep_] "r" (&rep_), [v] "r" (v)
24        : "memory"
25        );
26  }
27  inline void* NoBarrier_Load() const { return rep_; }
28  inline void NoBarrier_Store(void* v) { rep_ = v; }
29};
// We have neither MemoryBarrier(), nor <atomic>
#else
#error Please implement AtomicPointer for this platform.

#endif

// Remove the helper macros defined earlier in this header so they do not
// leak into files that include it. ARCH_CPU_MIPS_FAMILY is included here
// because it is defined alongside the other ARCH_CPU_*_FAMILY macros.
#undef LEVELDB_HAVE_MEMORY_BARRIER
#undef ARCH_CPU_X86_FAMILY
#undef ARCH_CPU_ARM_FAMILY
#undef ARCH_CPU_ARM64_FAMILY
#undef ARCH_CPU_PPC_FAMILY
#undef ARCH_CPU_MIPS_FAMILY

}  // namespace port
}  // namespace leveldb

#endif  // PORT_ATOMIC_POINTER_H_

参考资料:

内存屏障_维基百科

内存屏障_并发编程网

LINUX内核之内存屏障

Posts in this Series