mxnet
base.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
24 #ifndef MXNET_BASE_H_
25 #define MXNET_BASE_H_
26 
27 #include "dmlc/base.h"
28 #include <string>
29 #include "dmlc/io.h"
30 #include "dmlc/type_traits.h"
31 #include "dmlc/parameter.h"
32 #include "mshadow/tensor.h"
33 // nnvm headers for symbolic construction.
34 #include "nnvm/op.h"
35 #include "nnvm/symbolic.h"
36 #include "libinfo.h"
37 #include "tuple.h"
38 
/*!
 * \brief Symbol import/export marker for the MXNet API.
 *
 * Under MSVC this expands to __declspec(dllexport) when MXNET_EXPORTS is
 * defined and to __declspec(dllimport) otherwise. On every other compiler
 * it expands to nothing.
 */
42 #ifdef _MSC_VER
43 #ifdef MXNET_EXPORTS
44 #define MXNET_API __declspec(dllexport)
45 #else
46 #define MXNET_API __declspec(dllimport)
47 #endif
48 #else
49 #define MXNET_API
50 #endif
51 
/*! \brief MXNET_PREDICT_ONLY defaults to 0 unless the build already defined it. */
55 #ifndef MXNET_PREDICT_ONLY
56 #define MXNET_PREDICT_ONLY 0
57 #endif
58 
/*! \brief major component of the library version */
60 #define MXNET_MAJOR 2
61 
/*! \brief minor component of the library version */
62 #define MXNET_MINOR 0
63 
/*! \brief patch component of the library version */
64 #define MXNET_PATCH 0
65 
/*! \brief library version packed into one integer: major * 10000 + minor * 100 + patch */
66 #define MXNET_VERSION (MXNET_MAJOR * 10000 + MXNET_MINOR * 100 + MXNET_PATCH)
67 
/*!
 * \brief Pack a (major, minor, patch) triple into one integer using the same
 *        encoding as MXNET_VERSION: major * 10000 + minor * 100 + patch.
 *
 * Every argument is parenthesized so that an expression argument containing
 * a lower-precedence operator (for example `a ? b : c` or `x & 1`) is
 * evaluated as a unit instead of being absorbed into the surrounding sum.
 */
#define MXNET_MAKE_VERSION(major, minor, patch) ((major) * 10000 + (minor) * 100 + (patch))
69 
/*! \brief Expands to the name of the enclosing function (the compiler's __FUNCTION__). */
72 #define PROFILER_MESSAGE_FUNCNAME (__FUNCTION__)
73 
75 namespace mxnet {
77 typedef mshadow::cpu cpu;
79 typedef mshadow::gpu gpu;
87 using Op = nnvm::Op;
88 
/*!
 * \brief Context information about the execution environment.
 *
 * Pairs a device type with a device id and provides factory helpers,
 * ordering/equality operators and binary (de)serialization.
 */
90 struct Context {
  /*! \brief Type of device. */
92  enum DeviceType {
97  };
  /*! \brief device id we are going to run it on */
101  int32_t dev_id;
  /*!
   * \brief Get corresponding device mask.
   * \return kCPU for the pinned and shared CPU variants, dev_type otherwise.
   */
108  inline DeviceType dev_mask() const {
109  if (dev_type == kCPUPinned || dev_type == kCPUShared)
110  return kCPU;
111  return dev_type;
112  }
  /*!
   * \brief Returns dev_id for kGPU and kCPUPinned, 0 otherwise.
   */
116  inline int real_dev_id() const {
117  if (dev_type == kCPUPinned || dev_type == kGPU)
118  return dev_id;
119  return 0;
120  }
  /*!
   * \brief Comparator, used to enable Context as std::map key.
   * \param b another context to compare
   * \return whether this context sorts before b
   */
126  inline bool operator<(const Context& b) const;
  /*!
   * \brief check if current context equals another one
   * \param b another context to compare
   * \return true when both device type and device id match
   */
132  inline bool operator==(const Context& b) const {
133  return dev_type == b.dev_type && dev_id == b.dev_id;
134  }
  /*!
   * \brief check if current context not equals another one
   * \param b another context to compare
   * \return negation of operator==
   */
140  inline bool operator!=(const Context& b) const {
141  return !(*this == b);
142  }
  /*!
   * \brief save the content into binary stream
   * \param strm the output stream; dev_type is written first, then dev_id
   */
147  inline void Save(dmlc::Stream* strm) const {
148  strm->Write(&dev_type, sizeof(dev_type));
149  strm->Write(&dev_id, sizeof(dev_id));
150  }
  /*!
   * \brief load the content from binary stream
   * \param strm the input stream; dev_type is read first, then dev_id
   * \return false if either field could not be read in full, true otherwise
   */
156  inline bool Load(dmlc::Stream* strm) {
157  if (strm->Read(&dev_type, sizeof(dev_type)) != sizeof(dev_type))
158  return false;
159  if (strm->Read(&dev_id, sizeof(int32_t)) != sizeof(int32_t))
160  return false;
161  return true;
162  }
  /*! \brief the maximal device type */
164  static const int32_t kMaxDevType = 6;
  /*! \brief the maximal device index */
166  static const int32_t kMaxDevID = 16;
  /*!
   * \brief Create a new context.
   * \param dev_type device type.
   * \param dev_id device id; a negative value is stored as 0 unless the
   *        GPU branch of Create() replaces it with the current CUDA device.
   */
172  inline static Context Create(DeviceType dev_type, int32_t dev_id = -1);
  /*! \brief Shorthand for Create(kCPU, dev_id). */
174  inline static Context CPU(int32_t dev_id = 0);
  /*! \brief Shorthand for Create(kGPU, dev_id). */
180  inline static Context GPU(int32_t dev_id = -1);
  /*!
   * \brief Number of GPUs reported by cudaGetDeviceCount.
   * \return 0 when built without CUDA, when no driver is present, or when
   *         the runtime reports no device / an insufficient driver.
   */
185  inline static int32_t GetGPUCount();
  /*!
   * \brief Whether a CUDA driver is present.
   * \return false when built without CUDA; otherwise whether the reported
   *         driver version is greater than 0.
   */
190  inline static bool GPUDriverPresent();
  /*!
   * \brief Number of GPU streams available to a worker.
   * \return 2 if the MXNET_GPU_WORKER_NSTREAMS environment setting is >= 2,
   *         otherwise 1.
   */
195  inline static int32_t GetGPUStreamsPerWorker();
  /*!
   * \brief get the free and total available memory on a GPU
   * \param dev the GPU number to query
   * \param free out-parameter receiving the free memory
   * \param total out-parameter receiving the total memory
   */
203  inline static void GetGPUMemoryInformation(int dev, uint64_t* free, uint64_t* total);
  /*! \brief Shorthand for Create(kCPUPinned, dev_id). */
209  inline static Context CPUPinned(int32_t dev_id = -1);
  /*! \brief Shorthand for Create(kCPUShared, dev_id). */
215  inline static Context CPUShared(int32_t dev_id = 0);
  /*!
   * \brief Build a context from a string of the form "<type>(<id>)", where
   *        <type> is one of cpu, gpu, cpu_pinned, cpu_shared.
   * \param str the string to parse
   */
221  inline static Context FromString(const std::string& str);
222 
223  private:
  // Library checks invoked by Create() on its GPU branch; only declared
  // when the corresponding build option is enabled.
224 #if MXNET_USE_CUDA
225  static void CudaLibChecks();
226 #endif
227 #if MXNET_USE_CUDNN
228  static void CuDNNLibChecks();
229 #endif
230 };
231 
232 #if MXNET_USE_CUDA
233 
235  public:
240  explicit GPUAuxStream(mshadow::Stream<gpu>* primary_stream)
241  : primary_stream_(primary_stream),
242  aux_stream_(primary_stream),
243  gpu_stream_sync_event_(nullptr) {
244  if (Context::GetGPUStreamsPerWorker() >= 2) {
245  // Create auxiliary stream on the same device with the same properties as the primary stream
246  bool primary_has_blas_handle =
248  bool primary_has_dnn_handle =
250  aux_stream_ = mshadow::NewStream<gpu>(
251  primary_has_blas_handle, primary_has_dnn_handle, primary_stream->dev_id);
252  MSHADOW_CUDA_CALL(cudaEventCreateWithFlags(&gpu_stream_sync_event_, cudaEventDisableTiming));
253  }
254  }
257  // If the aux_stream_ == primary_stream_, then we created no new streams to destroy.
258  if (aux_stream_ != primary_stream_) {
260  MSHADOW_CATCH_ERROR(cudaEventDestroy(gpu_stream_sync_event_));
261  }
262  }
267  // If the aux_stream_ == primary_stream_, then no synchronization is necessary.
268  if (aux_stream_ != primary_stream_)
269  StreamSync(primary_stream_, aux_stream_, gpu_stream_sync_event_);
270  }
275  // If the aux_stream_ == primary_stream_, then no synchronization is necessary.
276  if (aux_stream_ != primary_stream_)
277  StreamSync(aux_stream_, primary_stream_, gpu_stream_sync_event_);
278  }
281  return aux_stream_;
282  }
289  static void StreamSync(mshadow::Stream<gpu>* s1, mshadow::Stream<gpu>* s2, cudaEvent_t event) {
290  MSHADOW_CUDA_CALL(cudaEventRecord(event, s1->stream_));
291  MSHADOW_CUDA_CALL(cudaStreamWaitEvent(s2->stream_, event, 0));
292  }
293 
294  private:
295  mshadow::Stream<gpu>* primary_stream_;
296  mshadow::Stream<gpu>* aux_stream_;
297  cudaEvent_t gpu_stream_sync_event_;
298 };
299 
309  public:
314  explicit SyncedGPUAuxStream(GPUAuxStream* gpu_aux_stream) : gpu_aux_stream_(gpu_aux_stream) {
315  gpu_aux_stream_->PreAuxStreamUseSync();
316  }
319  gpu_aux_stream_->PostAuxStreamUseSync();
320  }
322  SyncedGPUAuxStream(const SyncedGPUAuxStream&) = delete;
324  void operator=(const SyncedGPUAuxStream&) = delete;
330  inline mshadow::Stream<gpu>* GetStream() const {
331  return gpu_aux_stream_->GetStream();
332  }
333 
334  private:
335  GPUAuxStream* gpu_aux_stream_;
336 };
337 #endif // MXNET_USE_CUDA
338 
343 struct RunContext {
349  void* stream;
353  void* aux_stream;
357  void* event_pool = nullptr;
363  template <typename xpu>
365  return static_cast<mshadow::Stream<xpu>*>(stream);
366  }
367 #if MXNET_USE_CUDA
368 
373  return SyncedGPUAuxStream(static_cast<GPUAuxStream*>(aux_stream));
374  }
375 #endif
376 
377  inline const Context& get_ctx() const {
378  return ctx;
379  }
380 };
381 } // namespace mxnet
382 
384 namespace mxnet {
385 // implementing Context
386 inline bool Context::operator<(const Context& b) const {
387  if (dev_type == b.dev_type) {
388  return dev_id < b.dev_id;
389  } else {
390  return dev_type < b.dev_type;
391  }
392 }
393 inline Context Context::Create(DeviceType dev_type, int32_t dev_id) {
394  Context ctx;
395  ctx.dev_type = dev_type;
396  ctx.dev_id = dev_id < 0 ? 0 : dev_id;
397  if (dev_type & kGPU) {
398 #if MXNET_USE_CUDA
399  CudaLibChecks();
400 #endif
401 #if MXNET_USE_CUDNN
402  CuDNNLibChecks();
403 #endif
404  if (dev_id < 0) {
405 #if MXNET_USE_CUDA
406  CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
407 #else
408  LOG(FATAL) << "Please compile with CUDA enabled for cuda features";
409 #endif
410  }
411  }
412  return ctx;
413 }
414 inline Context Context::CPU(int32_t dev_id) {
415  return Create(kCPU, dev_id);
416 }
417 
418 inline Context Context::CPUPinned(int32_t dev_id) {
419  return Create(kCPUPinned, dev_id);
420 }
421 
422 inline Context Context::CPUShared(int32_t dev_id) {
423  return Create(kCPUShared, dev_id);
424 }
425 
426 inline Context Context::GPU(int32_t dev_id) {
427  return Create(kGPU, dev_id);
428 }
429 
430 inline bool Context::GPUDriverPresent() {
431 #if MXNET_USE_CUDA
432  int cuda_driver_version = 0;
433  CHECK_EQ(cudaDriverGetVersion(&cuda_driver_version), cudaSuccess);
434  return cuda_driver_version > 0;
435 #else
436  return false;
437 #endif
438 }
439 
440 inline int32_t Context::GetGPUCount() {
441 #if MXNET_USE_CUDA
442  if (!GPUDriverPresent()) {
443  return 0;
444  }
445  int32_t count;
446  cudaError_t e = cudaGetDeviceCount(&count);
447  // TODO(junwu): Remove e == cudaErrorInsufficientDriver
448  // This is skipped for working around wheel build system with older CUDA driver.
449  if (e == cudaErrorNoDevice || e == cudaErrorInsufficientDriver) {
450  return 0;
451  }
452  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
453  return count;
454 #else
455  return 0;
456 #endif
457 }
458 
459 inline int32_t Context::GetGPUStreamsPerWorker() {
460  // The default number of streams available if the user has not set MXNET_GPU_WORKER_NSTREAMS.
461  const int32_t default_num_streams = 1;
462  // The get_aux_stream() interface can supply one additional stream beyond the standard one.
463  static int32_t num_streams =
464  dmlc::GetEnv("MXNET_GPU_WORKER_NSTREAMS", default_num_streams) >= 2 ? 2 : 1;
465  return num_streams;
466 }
467 
468 inline void Context::GetGPUMemoryInformation(int dev, uint64_t* free_mem, uint64_t* total_mem) {
469 #if MXNET_USE_CUDA
470 
471  size_t memF, memT;
472  cudaError_t e;
473 
474  int curDevice;
475  e = cudaGetDevice(&curDevice);
476  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
477 
478  e = cudaSetDevice(dev);
479  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
480 
481  e = cudaMemGetInfo(&memF, &memT);
482  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
483 
484  e = cudaSetDevice(curDevice);
485  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
486 
487  *free_mem = static_cast<uint64_t>(memF);
488  *total_mem = static_cast<uint64_t>(memT);
489 
490 #else
491  LOG(FATAL) << "This call is only supported for MXNet built with CUDA support.";
492 #endif
493 }
494 
495 inline Context Context::FromString(const std::string& str) {
496  Context ret;
497  try {
498  const std::string::size_type l = str.find('(');
499  CHECK_NE(l, std::string::npos);
500  const std::string::size_type r = str.find(')');
501  CHECK_EQ(r, str.length() - 1);
502 
503  const std::string type = str.substr(0, l);
504  int id = std::stoi(str.substr(l + 1, r - l - 1));
505  if (type == "cpu") {
506  ret = CPU(id);
507  } else if (type == "gpu") {
508  ret = GPU(id);
509  } else if (type == "cpu_pinned") {
510  ret = CPUPinned(id);
511  } else if (type == "cpu_shared") {
512  ret = CPUShared(id);
513  } else {
514  LOG(FATAL) << "Invalid context string " << str;
515  }
516  } catch (...) {
517  LOG(FATAL) << "Invalid context string " << str;
518  }
519  return ret;
520 }
521 
522 inline std::ostream& operator<<(std::ostream& out, const Context& ctx) {
523  if (ctx.dev_type == Context::kCPU) {
524  out << "cpu(";
525  } else if (ctx.dev_type == Context::kGPU) {
526  out << "gpu(";
527  } else if (ctx.dev_type == Context::kCPUPinned) {
528  out << "cpu_pinned(";
529  } else if (ctx.dev_type == Context::kCPUShared) {
530  out << "cpu_shared(";
531  } else {
532  out << "unknown(";
533  }
534  out << ctx.dev_id << ")";
535  return out;
536 }
537 
538 // describe op registration point
// STRINGIZE(x) macro-expands x first and then turns the result into a string
// literal; the extra STRINGIZE_DETAIL level is what lets __LINE__ expand to
// its number before being stringized.
539 #define STRINGIZE_DETAIL(x) #x
540 #define STRINGIZE(x) STRINGIZE_DETAIL(x)
// Calls describe() with the given text followed by "\n\nFrom:<file>:<line>"
// of the expansion site (joined by string-literal concatenation).
541 #define MXNET_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" STRINGIZE(__LINE__))
// String literal "\n\nDefined in <file>:L<line>" for the expansion site.
542 #define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__)
543 
// Alignment constant, only defined when oneDNN or intgemm support is enabled.
// NOTE(review): presumably a byte alignment for buffers handed to those
// libraries - confirm at the use sites.
544 #if MXNET_USE_ONEDNN == 1 || MXNET_USE_INTGEMM == 1
545 constexpr size_t kDNNLAlign = 64;
546 #endif
547 
548 } // namespace mxnet
549 
550 namespace std {
551 template <>
552 struct hash<mxnet::Context> {
553  size_t operator()(const mxnet::Context& ctx) const {
554  size_t res = 0;
555  res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_type));
556  res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_id));
557  return res;
558  }
559 };
560 
#if __cplusplus < 201402L && !defined(_MSC_VER)
/*!
 * \brief Stand-in for std::make_unique, compiled only for pre-C++14 language
 *        modes on non-MSVC compilers.
 * \tparam Object type of the object to create.
 * \tparam CtorArgs types of the constructor arguments.
 * \param ctor_args arguments perfectly forwarded to Object's constructor.
 * \return a std::unique_ptr owning the newly allocated Object.
 */
template <typename Object, typename... CtorArgs>
inline std::unique_ptr<Object> make_unique(CtorArgs&&... ctor_args) {
  return std::unique_ptr<Object>(new Object(std::forward<CtorArgs>(ctor_args)...));
}
#endif
567 } // namespace std
568 
569 #include "./tensor_blob.h"
571 #endif // MXNET_BASE_H_
mxnet
namespace of mxnet
Definition: api_registry.h:33
mxnet::Context::kCPU
@ kCPU
Definition: base.h:93
mxnet::SyncedGPUAuxStream::operator=
void operator=(const SyncedGPUAuxStream &)=delete
copy assignment operator deleted to prevent unexpected synchronizations.
mshadow::Stream
computation stream structure, used for asynchronous computations
Definition: tensor.h:488
mxnet::SyncedGPUAuxStream
Provides automatic coordination of an auxiliary stream with a primary one. This object,...
Definition: base.h:308
mshadow::DeleteStream< gpu >
void DeleteStream< gpu >(Stream< gpu > *stream)
Definition: stream_gpu-inl.h:255
mxnet::Context::GetGPUMemoryInformation
static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total)
get the free and total available memory on a GPU
mxnet::GPUAuxStream::~GPUAuxStream
~GPUAuxStream()
destructor
Definition: base.h:256
mxnet::Context::kMaxDevType
static const int32_t kMaxDevType
the maximal device type
Definition: base.h:164
libinfo.h
get features of the MXNet library at runtime
tensor_blob.h
TBlob class that holds a common representation of an arbitrary-dimension tensor; can be transformed...
mshadow::Stream< gpu >::stream_
cudaStream_t stream_
cudaStream
Definition: stream_gpu-inl.h:44
MSHADOW_CUDA_CALL
#define MSHADOW_CUDA_CALL(func)
Protected cuda call in mshadow.
Definition: base.h:264
parameter.h
Provide lightweight util to do parameter setup and checking.
mxnet::Context::dev_type
DeviceType dev_type
the device type we run the op on
Definition: base.h:99
mxnet::Context::GetGPUCount
static int32_t GetGPUCount()
op.h
Operator information structure.
base.h
defines configuration macros
mxnet::GPUAuxStream
Holds an auxiliary mshadow gpu stream that can be synced with a primary stream.
Definition: base.h:234
mxnet::Context::operator==
bool operator==(const Context &b) const
check if current context equals another one
Definition: base.h:132
mxnet::RunContext
execution time context. The information needed in runtime for actual execution.
Definition: base.h:343
mshadow::cpu::kDevMask
static const int kDevMask
device flag number, identifies this device
Definition: tensor.h:43
mshadow::Stream< gpu >::dev_id
int dev_id
dev id
Definition: stream_gpu-inl.h:71
mxnet::Context::GPU
static Context GPU(int32_t dev_id=-1)
mxnet::Context::GetGPUStreamsPerWorker
static int32_t GetGPUStreamsPerWorker()
dmlc::Stream::Write
virtual void Write(const void *ptr, size_t size)=0
writes data to a stream
mxnet::SyncedGPUAuxStream::GetStream
mshadow::Stream< gpu > * GetStream() const
Getter for underlying mshadow::Stream<gpu>.
Definition: base.h:330
mxnet::Context::Load
bool Load(dmlc::Stream *strm)
load the content from binary stream
Definition: base.h:156
mxnet::Context::kCPUShared
@ kCPUShared
Definition: base.h:96
mxnet::Context::dev_mask
DeviceType dev_mask() const
Get corresponding device mask.
Definition: base.h:108
mshadow::gpu
device name GPU
Definition: tensor.h:46
mshadow::cpu
device name CPU
Definition: tensor.h:39
dmlc::HashCombine
size_t HashCombine(size_t key, const T &value)
hash an object and combines the key with previous keys
Definition: common.h:37
mxnet::Context::operator<
bool operator<(const Context &b) const
Comparator, used to enable Context as std::map key.
tensor.h
header file of tensor data structure and functions This lib requires explicit memory allocation and d...
mxnet::lapack_index_t
mshadow::lapack_index_t lapack_index_t
index type for blas library.
Definition: base.h:83
mxnet::Context::DeviceType
DeviceType
Type of device.
Definition: base.h:92
mxnet::Context::kCPUPinned
@ kCPUPinned
Definition: base.h:95
mxnet::Context::GPUDriverPresent
static bool GPUDriverPresent()
mxnet::Context::CPUShared
static Context CPUShared(int32_t dev_id=0)
mshadow::Stream< gpu >
Definition: stream_gpu-inl.h:37
mshadow::NewStream< gpu >
Stream< gpu > * NewStream< gpu >(bool create_blas_handle, bool create_dnn_handle, int dev_id)
Definition: stream_gpu-inl.h:266
mxnet::SyncedGPUAuxStream::~SyncedGPUAuxStream
~SyncedGPUAuxStream()
destructor
Definition: base.h:318
dmlc::operator<<
std::ostream & operator<<(std::ostream &os, const optional< T > &t)
serialize an optional object to string.
Definition: optional.h:151
symbolic.h
Symbolic graph construction API.
mxnet::GPUAuxStream::StreamSync
static void StreamSync(mshadow::Stream< gpu > *s1, mshadow::Stream< gpu > *s2, cudaEvent_t event)
Make future work enqueued to s2 wait on completion of current work enqueued to s1.
Definition: base.h:289
mshadow::index_t
int32_t index_t
type that will be used for index
Definition: base.h:328
mxnet::GPUAuxStream::GPUAuxStream
GPUAuxStream(mshadow::Stream< gpu > *primary_stream)
constructor.
Definition: base.h:240
mshadow::Stream< gpu >::blas_handle_ownership_
HandleState blas_handle_ownership_
cudnn handle
Definition: stream_gpu-inl.h:60
mxnet::RunContext::get_stream
mshadow::Stream< xpu > * get_stream() const
get mshadow stream from Context
Definition: base.h:364
mxnet::Context::dev_id
int32_t dev_id
device id we are going to run it on
Definition: base.h:101
mshadow::lapack_index_t
int lapack_index_t
Definition: base.h:344
mxnet::Context::kGPU
@ kGPU
Definition: base.h:94
io.h
defines serializable interface of dmlc
mxnet::Context::CPUPinned
static Context CPUPinned(int32_t dev_id=-1)
MSHADOW_CATCH_ERROR
#define MSHADOW_CATCH_ERROR(func)
Run function and catch error, log unknown error.
Definition: base.h:278
mxnet::GPUAuxStream::GetStream
mshadow::Stream< gpu > * GetStream()
Getter for created auxiliary stream.
Definition: base.h:280
mxnet::Context
Context information about the execution environment.
Definition: base.h:90
dmlc::Stream::Read
virtual size_t Read(void *ptr, size_t size)=0
reads data from a stream
mxnet::cpu
mshadow::cpu cpu
mxnet cpu
Definition: base.h:77
mshadow::default_real_t
float default_real_t
floating-point type that is used by default in mshadow
Definition: base.h:348
mxnet::Context::Save
void Save(dmlc::Stream *strm) const
save the content into binary stream
Definition: base.h:147
mshadow::Stream< gpu >::dnn_handle_ownership_
HandleState dnn_handle_ownership_
cudnn handle ownership
Definition: stream_gpu-inl.h:64
mxnet::index_t
mshadow::index_t index_t
index type usually use unsigned
Definition: base.h:81
mxnet::Context::real_dev_id
int real_dev_id() const
Returns dev_id for kGPU and kCPUPinned, 0 otherwise.
Definition: base.h:116
std
Definition: optional.h:251
type_traits.h
type traits information header
mxnet::RunContext::get_gpu_aux_stream
SyncedGPUAuxStream get_gpu_aux_stream() const
get an RAII object that transparently handles the syncing of the auxiliary stream.
Definition: base.h:372
mxnet::Context::FromString
static Context FromString(const std::string &str)
mxnet::Context::CPU
static Context CPU(int32_t dev_id=0)
mxnet::RunContext::stream
void * stream
the stream of the device, can be nullptr or Stream<gpu>* in GPU mode
Definition: base.h:349
mxnet::RunContext::ctx
Context ctx
base Context
Definition: base.h:345
mxnet::Context::operator!=
bool operator!=(const Context &b) const
check if current context not equals another one
Definition: base.h:140
mxnet::Context::Create
static Context Create(DeviceType dev_type, int32_t dev_id=-1)
Create a new context.
tuple.h
Data structure Tuple and TShape to store dynamic sized shapes.
mxnet::Context::Context
Context()
default constructor
Definition: base.h:103
mxnet::GPUAuxStream::PreAuxStreamUseSync
void PreAuxStreamUseSync()
Makes future aux stream work wait on the completion of existing primary stream work.
Definition: base.h:266
mxnet::gpu
mshadow::gpu gpu
mxnet gpu
Definition: base.h:79
mxnet::GPUAuxStream::PostAuxStreamUseSync
void PostAuxStreamUseSync()
Makes future primary stream work wait on the completion of existing aux stream work.
Definition: base.h:274
mshadow::gpu::kDevMask
static const int kDevMask
device flag number, identifies this device
Definition: tensor.h:50
mxnet::RunContext::get_ctx
const Context & get_ctx() const
get the base Context from RunContext
Definition: base.h:377
mxnet::Op
nnvm::Op Op
operator structure from NNVM
Definition: base.h:87
mxnet::RunContext::aux_stream
void * aux_stream
the auxiliary stream of the device, can be nullptr or Stream<gpu>* in GPU mode
Definition: base.h:353
nnvm::Op
Operator structure.
Definition: op.h:105
dmlc::Stream
interface of stream I/O for serialization
Definition: io.h:30
mxnet::Context::kMaxDevID
static const int32_t kMaxDevID
the maximal device index
Definition: base.h:166
mxnet::real_t
mshadow::default_real_t real_t
data type that will be used to store ndarray
Definition: base.h:85
mxnet::RunContext::event_pool
void * event_pool
pointer to the cuda event pool used by the dependency engine
Definition: base.h:357
mxnet::cpp::DeviceType
DeviceType
Definition: ndarray.h:40
mxnet::SyncedGPUAuxStream::SyncedGPUAuxStream
SyncedGPUAuxStream(GPUAuxStream *gpu_aux_stream)
constructor.
Definition: base.h:314