mxnet/base.h
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

#ifndef MXNET_BASE_H_
#define MXNET_BASE_H_

#include "dmlc/base.h"
#include <string>
#include "dmlc/io.h"
#include "dmlc/type_traits.h"
#include "dmlc/parameter.h"
#include "mshadow/tensor.h"
// nnvm headers for symbolic construction.
#include "nnvm/op.h"
#include "nnvm/symbolic.h"
#include "libinfo.h"
#include "tuple.h"

#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
#error "Currently we need g++ 4.8 or higher to fully support c++11 features"
#define override
#define final
#endif
#endif

#ifdef _MSC_VER
#ifdef MXNET_EXPORTS
#define MXNET_API __declspec(dllexport)
#else
#define MXNET_API __declspec(dllimport)
#endif
#else
#define MXNET_API
#endif

#ifndef MXNET_PREDICT_ONLY
#define MXNET_PREDICT_ONLY 0
#endif

/*! \brief major version */
#define MXNET_MAJOR 1
/*! \brief minor version */
#define MXNET_MINOR 7
/*! \brief patch version */
#define MXNET_PATCH 0
/*! \brief mxnet version */
#define MXNET_VERSION (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH)
/*! \brief helper for making version number */
#define MXNET_MAKE_VERSION(major, minor, patch) ((major)*10000 + (minor)*100 + patch)
/*! \brief define function name macro for profiler */
#define PROFILER_MESSAGE_FUNCNAME (__FUNCTION__)

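// Illustrative example (not part of the original header): downstream code can
// compare MXNET_VERSION against MXNET_MAKE_VERSION to guard version-specific
// paths. With MXNET_MAJOR=1, MXNET_MINOR=7, MXNET_PATCH=0 the packed value is
// 1*10000 + 7*100 + 0 = 10700.
//
//   #if MXNET_VERSION >= MXNET_MAKE_VERSION(1, 7, 0)
//     // code that relies on 1.7+ behaviour
//   #endif
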
/*! \brief namespace of mxnet */
namespace mxnet {
/*! \brief mxnet cpu */
typedef mshadow::cpu cpu;
/*! \brief mxnet gpu */
typedef mshadow::gpu gpu;
/*! \brief index type usually use unsigned */
typedef mshadow::index_t index_t;
/*! \brief data type that will be used to store ndarray */
typedef mshadow::default_real_t real_t;
/*! \brief operator structure from NNVM */
using Op = nnvm::Op;

/*! \brief Context information about the execution environment. */
struct Context {
  /*! \brief Type of device */
  enum DeviceType {
    kCPU = cpu::kDevMask,
    kGPU = gpu::kDevMask,
    kCPUPinned = 3,
    kCPUShared = 5,
  };
  /*! \brief the device type we run the op on */
  DeviceType dev_type;
  /*! \brief device id we are going to run it on */
  int32_t dev_id;
  /*! \brief default constructor */
  Context() : dev_type(kCPU), dev_id(0) {}
  /*! \brief Get corresponding device mask. */
  inline DeviceType dev_mask() const {
    if (dev_type == kCPUPinned || dev_type == kCPUShared) return kCPU;
    return dev_type;
  }
  /*! \brief Returns dev_id for kGPU and kCPUPinned, 0 otherwise. */
  inline int real_dev_id() const {
    if (dev_type == kCPUPinned || dev_type == kGPU) return dev_id;
    return 0;
  }
  /*! \brief Comparator, used to enable Context as std::map key. */
  inline bool operator<(const Context &b) const;
  /*! \brief check if current context equals another one */
  inline bool operator==(const Context &b) const {
    return dev_type == b.dev_type && dev_id == b.dev_id;
  }
  /*! \brief check if current context not equals another one */
  inline bool operator!=(const Context &b) const {
    return !(*this == b);
  }
  /*! \brief save the content into binary stream */
  inline void Save(dmlc::Stream *strm) const {
    strm->Write(&dev_type, sizeof(dev_type));
    strm->Write(&dev_id, sizeof(dev_id));
  }
  /*! \brief load the content from binary stream */
  inline bool Load(dmlc::Stream *strm) {
    if (strm->Read(&dev_type, sizeof(dev_type)) != sizeof(dev_type)) return false;
    if (strm->Read(&dev_id, sizeof(int32_t)) != sizeof(int32_t)) return false;
    return true;
  }
  /*! \brief the maximal device type */
  static const int32_t kMaxDevType = 6;
  /*! \brief the maximal device index */
  static const int32_t kMaxDevID = 16;
  /*! \brief Create a new context. */
  inline static Context Create(DeviceType dev_type, int32_t dev_id = -1);
  /*! \brief Create a CPU context. */
  inline static Context CPU(int32_t dev_id = 0);
  /*! \brief Create a GPU context; -1 for the current device. */
  inline static Context GPU(int32_t dev_id = -1);
  /*! \brief Get the number of GPUs available. */
  inline static int32_t GetGPUCount();
  /*! \brief Whether a CUDA driver is present on this system. */
  inline static bool GPUDriverPresent();
  /*! \brief Get the number of streams available to each GPU worker. */
  inline static int32_t GetGPUStreamsPerWorker();
  /*! \brief get the free and total available memory on a GPU */
  inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total);
  /*! \brief Create a CPU-pinned-memory context; -1 for the current GPU. */
  inline static Context CPUPinned(int32_t dev_id = -1);
  /*! \brief Create a CPU shared-memory context. */
  inline static Context CPUShared(int32_t dev_id = 0);
  /*! \brief Create a context from a string such as "cpu(0)" or "gpu(1)". */
  inline static Context FromString(const std::string& str);

 private:
#if MXNET_USE_CUDA
  static void CudaLibChecks();
#endif
#if MXNET_USE_CUDNN
  static void CuDNNLibChecks();
#endif
};

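// Illustrative example (not part of the original header): typical Context use.
// The factory functions are the ones declared above; the surrounding code is
// hypothetical.
//
//   mxnet::Context cpu_ctx = mxnet::Context::CPU();        // cpu(0)
//   mxnet::Context gpu_ctx = mxnet::Context::GPU(1);       // gpu(1)
//   mxnet::Context pinned  = mxnet::Context::CPUPinned(1); // cpu_pinned(1)
//
//   // Pinned and shared-memory contexts report the CPU device mask.
//   CHECK_EQ(pinned.dev_mask(), mxnet::Context::kCPU);
//
//   // operator< makes Context usable as a std::map key.
//   std::map<mxnet::Context, int> refcount;
//   refcount[gpu_ctx] += 1;
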
#if MXNET_USE_CUDA
/*! \brief Holds an auxiliary mshadow gpu stream that can be synced with a primary stream. */
class GPUAuxStream {
 public:
  /*!
   * \brief constructor.
   * \param primary_stream gpu stream that the auxiliary stream syncs with.
   */
  explicit GPUAuxStream(mshadow::Stream<gpu> *primary_stream) :
    primary_stream_(primary_stream),
    aux_stream_(primary_stream),
    gpu_stream_sync_event_(nullptr) {
    if (Context::GetGPUStreamsPerWorker() >= 2) {
      // Create auxiliary stream on the same device with the same properties as the primary stream
      bool primary_has_blas_handle =
          primary_stream->blas_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle;
      bool primary_has_dnn_handle =
          primary_stream->dnn_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle;
      aux_stream_ = mshadow::NewStream<gpu>(primary_has_blas_handle,
                                            primary_has_dnn_handle,
                                            primary_stream->dev_id);
      MSHADOW_CUDA_CALL(cudaEventCreateWithFlags(&gpu_stream_sync_event_, cudaEventDisableTiming));
    }
  }
  /*! \brief destructor */
  ~GPUAuxStream() {
    // If the aux_stream_ == primary_stream_, then we created no new streams to destroy.
    if (aux_stream_ != primary_stream_) {
      MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(aux_stream_));
      MSHADOW_CATCH_ERROR(cudaEventDestroy(gpu_stream_sync_event_));
    }
  }
  /*! \brief Makes future aux stream work wait on the completion of existing primary stream work. */
  void PreAuxStreamUseSync() {
    // If the aux_stream_ == primary_stream_, then no synchronization is necessary.
    if (aux_stream_ != primary_stream_)
      StreamSync(primary_stream_, aux_stream_, gpu_stream_sync_event_);
  }
  /*! \brief Makes future primary stream work wait on the completion of existing aux stream work. */
  void PostAuxStreamUseSync() {
    // If the aux_stream_ == primary_stream_, then no synchronization is necessary.
    if (aux_stream_ != primary_stream_)
      StreamSync(aux_stream_, primary_stream_, gpu_stream_sync_event_);
  }
  /*! \brief Getter for created auxiliary stream. */
  mshadow::Stream<gpu> *GetStream() { return aux_stream_; }
  /*! \brief Make future work enqueued to s2 wait on completion of current work enqueued to s1. */
  static void StreamSync(mshadow::Stream<gpu> *s1, mshadow::Stream<gpu> *s2, cudaEvent_t event) {
    MSHADOW_CUDA_CALL(cudaEventRecord(event, s1->stream_));
    MSHADOW_CUDA_CALL(cudaStreamWaitEvent(s2->stream_, event, 0));
  }

 private:
  mshadow::Stream<gpu> *primary_stream_;
  mshadow::Stream<gpu> *aux_stream_;
  cudaEvent_t gpu_stream_sync_event_;
};

/*!
 * \brief Provides automatic coordination of an auxiliary stream with a primary one.
 * This object, upon construction, prepares an aux stream for use by syncing it with enqueued
 * primary-stream work. Object destruction will sync again so future primary-stream work will
 * wait on enqueued aux-stream work. If MXNET_GPU_WORKER_NSTREAMS == 1, then this defaults
 * simply: the primary stream will equal the aux stream and the syncs will be executed as nops.
 * See ./src/operator/cudnn/cudnn_convolution-inl.h for a usage example.
 */
class SyncedGPUAuxStream {
 public:
  /*!
   * \brief constructor.
   * \param gpu_aux_stream auxiliary gpu stream that is managed by this RAII object.
   */
  explicit SyncedGPUAuxStream(GPUAuxStream *gpu_aux_stream) : gpu_aux_stream_(gpu_aux_stream) {
    gpu_aux_stream_->PreAuxStreamUseSync();
  }
  /*! \brief destructor */
  ~SyncedGPUAuxStream() {
    gpu_aux_stream_->PostAuxStreamUseSync();
  }
  /*! \brief copy constructor deleted to prevent unexpected synchronizations. */
  SyncedGPUAuxStream(const SyncedGPUAuxStream&) = delete;
  /*! \brief copy assignment deleted to prevent unexpected synchronizations. */
  void operator=(const SyncedGPUAuxStream&) = delete;
  /*! \brief move constructor permitted. */
  SyncedGPUAuxStream(SyncedGPUAuxStream&&) = default;
  /*! \brief move assignment permitted. */
  SyncedGPUAuxStream& operator=(SyncedGPUAuxStream&&) = default;
  /*! \brief Getter for underlying mshadow::Stream<gpu>. */
  inline mshadow::Stream<gpu>* GetStream() const {
    return gpu_aux_stream_->GetStream();
  }

 private:
  GPUAuxStream *gpu_aux_stream_;
};
#endif // MXNET_USE_CUDA

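// Illustrative example (not part of the original header): under MXNET_USE_CUDA,
// a GPU operator that wants to overlap independent kernels can obtain the
// RAII-synced auxiliary stream through RunContext::get_gpu_aux_stream() (declared
// below). The operator body here is hypothetical.
//
//   void HypotheticalCompute(const mxnet::RunContext &rctx) {
//     mshadow::Stream<mxnet::gpu> *s = rctx.get_stream<mxnet::gpu>();
//     mxnet::SyncedGPUAuxStream aux = rctx.get_gpu_aux_stream();
//     // Launch independent work on s and on aux.GetStream(); the SyncedGPUAuxStream
//     // constructor and destructor insert the event waits that keep the two
//     // streams ordered with respect to each other.
//   }
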
/*!
 * \brief execution time context.
 *  The information needed in runtime for actual execution.
 */
struct RunContext {
  /*! \brief base Context */
  Context ctx;
  /*! \brief the stream of the device, can be nullptr or Stream<gpu>* in GPU mode */
  void *stream;
  /*! \brief the auxiliary stream of the device, can be nullptr or Stream<gpu>* in GPU mode */
  void *aux_stream;
  /*! \brief indicator of whether this execution is run in bulk mode */
  bool is_bulk;
  /*! \brief get mshadow stream from Context */
  template<typename xpu>
  inline mshadow::Stream<xpu>* get_stream() const {
    return static_cast<mshadow::Stream<xpu>*>(stream);
  }
#if MXNET_USE_CUDA
  /*! \brief get an RAII object that transparently handles the syncing of the auxiliary stream. */
  inline SyncedGPUAuxStream get_gpu_aux_stream() const {
    return SyncedGPUAuxStream(static_cast<GPUAuxStream*>(aux_stream));
  }
#endif
  /*! \brief get the base Context from RunContext */
  inline const Context& get_ctx() const {
    return ctx;
  }
};
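// Illustrative example (not part of the original header): the stream pointer is
// type-erased as void*, so callers cast it back through the templated accessor.
// The function below is hypothetical.
//
//   void HypotheticalCopy(const mxnet::RunContext &rctx) {
//     if (rctx.get_ctx().dev_mask() == mxnet::Context::kGPU) {
//       mshadow::Stream<mxnet::gpu> *s = rctx.get_stream<mxnet::gpu>();
//       // enqueue asynchronous GPU work on s
//     } else {
//       mshadow::Stream<mxnet::cpu> *s = rctx.get_stream<mxnet::cpu>();
//       // run CPU work; s may be unused on CPU
//     }
//   }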
} // namespace mxnet

namespace mxnet {
// implementing Context
inline bool Context::operator<(const Context &b) const {
  if (dev_type == b.dev_type) {
    return dev_id < b.dev_id;
  } else {
    return dev_type < b.dev_type;
  }
}
inline Context Context::Create(DeviceType dev_type, int32_t dev_id) {
  Context ctx;
  ctx.dev_type = dev_type;
  ctx.dev_id = dev_id < 0 ? 0 : dev_id;
  if (dev_type & kGPU) {
#if MXNET_USE_CUDA
    CudaLibChecks();
#endif
#if MXNET_USE_CUDNN
    CuDNNLibChecks();
#endif
    if (dev_id < 0) {
#if MXNET_USE_CUDA
      CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
#else
      LOG(FATAL) << "Please compile with CUDA enabled for cuda features";
#endif
    }
  }
  return ctx;
}
inline Context Context::CPU(int32_t dev_id) {
  return Create(kCPU, dev_id);
}

inline Context Context::CPUPinned(int32_t dev_id) {
  return Create(kCPUPinned, dev_id);
}

inline Context Context::CPUShared(int32_t dev_id) {
  return Create(kCPUShared, dev_id);
}

inline Context Context::GPU(int32_t dev_id) {
  return Create(kGPU, dev_id);
}

inline bool Context::GPUDriverPresent() {
#if MXNET_USE_CUDA
  int cuda_driver_version = 0;
  CHECK_EQ(cudaDriverGetVersion(&cuda_driver_version), cudaSuccess);
  return cuda_driver_version > 0;
#else
  return false;
#endif
}

inline int32_t Context::GetGPUCount() {
#if MXNET_USE_CUDA
  if (!GPUDriverPresent()) {
    return 0;
  }
  int32_t count;
  cudaError_t e = cudaGetDeviceCount(&count);
  // TODO(junwu): Remove e == cudaErrorInsufficientDriver
  // This is skipped for working around wheel build system with older CUDA driver.
  if (e == cudaErrorNoDevice || e == cudaErrorInsufficientDriver) {
    return 0;
  }
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
  return count;
#else
  return 0;
#endif
}

inline int32_t Context::GetGPUStreamsPerWorker() {
  // The default number of streams available if the user has not set MXNET_GPU_WORKER_NSTREAMS.
  const int32_t default_num_streams = 1;
  // The get_aux_stream() interface can supply one additional stream beyond the standard one.
  static int32_t num_streams =
      dmlc::GetEnv("MXNET_GPU_WORKER_NSTREAMS", default_num_streams) >= 2 ? 2 : 1;
  return num_streams;
}

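// Illustrative example (not part of the original header): the environment
// variable is read once into a static, so it must be set before the first GPU
// worker starts.
//
//   // shell: ask each GPU worker for a second (auxiliary) stream
//   //   export MXNET_GPU_WORKER_NSTREAMS=2
//   int32_t n = mxnet::Context::GetGPUStreamsPerWorker();  // 2 if >=2 was requested, else 1
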
inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem,
                                             uint64_t *total_mem) {
#if MXNET_USE_CUDA

  size_t memF, memT;
  cudaError_t e;

  int curDevice;
  e = cudaGetDevice(&curDevice);
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

  e = cudaSetDevice(dev);
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

  e = cudaMemGetInfo(&memF, &memT);
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

  e = cudaSetDevice(curDevice);
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

  *free_mem = static_cast<uint64_t>(memF);
  *total_mem = static_cast<uint64_t>(memT);

#else
  LOG(FATAL)
    << "This call is only supported for MXNet built with CUDA support.";
#endif
}

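// Illustrative example (not part of the original header): querying device memory
// for GPU 0.
//
//   uint64_t free_b = 0, total_b = 0;
//   mxnet::Context::GetGPUMemoryInformation(0, &free_b, &total_b);
//   LOG(INFO) << "GPU 0: " << free_b << " of " << total_b << " bytes free";
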
inline Context Context::FromString(const std::string& str) {
  Context ret;
  try {
    const std::string::size_type l = str.find('(');
    CHECK_NE(l, std::string::npos);
    const std::string::size_type r = str.find(')');
    CHECK_EQ(r, str.length()-1);

    const std::string type = str.substr(0, l);
    int id = std::stoi(str.substr(l+1, r-l-1));
    if (type == "cpu") {
      ret = CPU(id);
    } else if (type == "gpu") {
      ret = GPU(id);
    } else if (type == "cpu_pinned") {
      ret = CPUPinned(id);
    } else if (type == "cpu_shared") {
      ret = CPUShared(id);
    } else {
      LOG(FATAL) << "Invalid context string " << str;
    }
  } catch (...) {
    LOG(FATAL) << "Invalid context string " << str;
  }
  return ret;
}

inline std::ostream& operator<<(std::ostream &out, const Context &ctx) {
  if (ctx.dev_type == Context::kCPU) {
    out << "cpu(";
  } else if (ctx.dev_type == Context::kGPU) {
    out << "gpu(";
  } else if (ctx.dev_type == Context::kCPUPinned) {
    out << "cpu_pinned(";
  } else if (ctx.dev_type == Context::kCPUShared) {
    out << "cpu_shared(";
  } else {
    out << "unknown(";
  }
  out << ctx.dev_id << ")";
  return out;
}

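// Illustrative example (not part of the original header): FromString and
// operator<< round-trip the textual form produced above.
//
//   mxnet::Context ctx = mxnet::Context::FromString("gpu(2)");
//   std::ostringstream oss;
//   oss << ctx;                      // writes "gpu(2)"
//   CHECK_EQ(oss.str(), "gpu(2)");
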
// describe op registration point
#define STRINGIZE_DETAIL(x) #x
#define STRINGIZE(x) STRINGIZE_DETAIL(x)
#define MXNET_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" STRINGIZE(__LINE__))
#define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__)

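// Illustrative sketch (not part of the original header): both macros are meant
// to be used in op registration code; the op names below are made up for the
// example.
//
//   NNVM_REGISTER_OP(_hypothetical_op)
//   .MXNET_DESCRIBE("Toy op; the macro appends the registering file and line.");
//
//   NNVM_REGISTER_OP(_another_hypothetical_op)
//   .describe("Toy op with an explicit file/line suffix." ADD_FILELINE);
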
#if MXNET_USE_MKLDNN == 1
constexpr size_t kMKLDNNAlign = 64;
#endif

} // namespace mxnet

namespace std {
template<> struct hash<mxnet::Context> {
  size_t operator()(const mxnet::Context& ctx) const {
    size_t res = 0;
    res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_type));
    res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_id));
    return res;
  }
};

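// Illustrative example (not part of the original header): the specialization
// above lets Context key unordered containers directly.
//
//   std::unordered_map<mxnet::Context, size_t> bytes_allocated;
//   bytes_allocated[mxnet::Context::GPU(0)] += 4096;
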
#if __cplusplus < 201402L && !defined(_MSC_VER)
template<typename T, typename... Args>
inline std::unique_ptr<T> make_unique(Args&&... args) {
  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
#endif
} // namespace std

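// Illustrative example (not part of the original header): on pre-C++14
// toolchains the fallback above provides std::make_unique with the usual
// semantics, so the call site is the same either way.
//
//   auto buf = std::make_unique<std::vector<int>>(16, 0);  // vector of 16 zeros
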
#include "./tensor_blob.h"

#endif // MXNET_BASE_H_