mxnet
stream_gpu-inl.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
25 #ifndef MSHADOW_STREAM_GPU_INL_H_
26 #define MSHADOW_STREAM_GPU_INL_H_
27 #include <memory>
28 #include "./base.h"
29 #include "./tensor.h"
30 #include "dmlc/logging.h"
31 
32 namespace mshadow {
33 #if MSHADOW_USE_CUDA == 1
// Stream allocation
// actual implementation of GPU stream in CUDA
36 template<>
37 struct Stream<gpu> {
39  enum HandleState {
40  NoHandle = 0,
41  OwnHandle = 1,
42  };
44  cudaStream_t stream_;
46  cublasHandle_t blas_handle_;
48  #if MSHADOW_USE_CUSOLVER == 1
49  cusolverDnHandle_t solver_handle_;
50  #endif
51 
52  #if MSHADOW_USE_CUDNN == 1
53  cudnnHandle_t dnn_handle_;
54  #endif
55 
56  #if MSHADOW_USE_CUTENSOR== 1
57  cutensorHandle_t cutensor_handle_;
58  #endif
59 
67  void* cutensor_cachelines_ = nullptr;
69  cudaDeviceProp prop;
71  int dev_id;
72 
73  Stream(void)
74  : stream_(0)
75  , blas_handle_(0)
76 #if MSHADOW_USE_CUDNN == 1
77  , dnn_handle_(0)
78 #endif
79  //, cutensor_handle_()
80  , blas_handle_ownership_(NoHandle)
81  , solver_handle_ownership_(NoHandle)
82  , dnn_handle_ownership_(NoHandle)
83  , cutensor_handle_ownership_(NoHandle)
84  , cutensor_cachelines_(nullptr){}
89  inline void Wait(void) {
90  MSHADOW_CUDA_CALL(cudaStreamSynchronize(stream_));
91  }
96  inline bool CheckIdle(void) {
97  cudaError_t err = cudaStreamQuery(stream_);
98  if (err == cudaSuccess) return true;
99  if (err == cudaErrorNotReady) return false;
100  LOG(FATAL) << cudaGetErrorString(err);
101  return false;
102  }
107  inline static cudaStream_t GetStream(Stream<gpu> *stream) {
108  if (stream == NULL) {
109 #if MSHADOW_FORCE_STREAM
110  LOG(FATAL) << "Default GPU stream was used when MSHADOW_FORCE_STREAM was on";
111 #endif
112  return 0;
113  } else {
114  return stream->stream_;
115  }
116  }
121  inline static cublasHandle_t GetBlasHandle(Stream<gpu> *stream) {
122  if (stream == NULL) {
123  return 0;
124  } else {
125  CHECK_NE(stream->blas_handle_ownership_, NoHandle)
126  << "No handle exist in source stream";
127  return stream->blas_handle_;
128  }
129  }
131  inline void DestroyBlasHandle() {
132  if (blas_handle_ownership_ == OwnHandle) {
133  cublasStatus_t err = cublasDestroy(blas_handle_);
134  blas_handle_ownership_ = NoHandle;
135  CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Destory cublas handle failed";
136  }
137  }
139  inline void CreateBlasHandle() {
140  this->DestroyBlasHandle();
141  cublasStatus_t err = cublasCreate(&blas_handle_);
142  blas_handle_ownership_ = OwnHandle;
143  CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Create cublas handle failed";
144  err = cublasSetStream(blas_handle_, stream_);
145  CHECK_EQ(err, CUBLAS_STATUS_SUCCESS) << "Setting cublas stream failed";
146  }
147 #if MSHADOW_USE_CUSOLVER == 1
148  inline static cusolverDnHandle_t GetSolverHandle(Stream<gpu> *stream) {
149  if (stream == NULL) {
150  return 0;
151  } else {
152  CHECK_NE(stream->solver_handle_ownership_, NoHandle) << "No handle exist in source stream";
153  return stream->solver_handle_;
154  }
155  }
156 #endif
157  inline void DestroySolverHandle() {
158 #if MSHADOW_USE_CUSOLVER == 1
159  if (solver_handle_ownership_ == OwnHandle) {
160  cusolverStatus_t err = cusolverDnDestroy(solver_handle_);
161  CHECK_EQ(err, CUSOLVER_STATUS_SUCCESS) << "Destory cusolver handle failed";
162  }
163 #endif
164  }
165  inline void CreateSolverHandle() {
166 #if MSHADOW_USE_CUSOLVER == 1
167  this->DestroySolverHandle();
168  cusolverStatus_t err = cusolverDnCreate(&solver_handle_);
169  CHECK_EQ(err, CUSOLVER_STATUS_SUCCESS) << "Create cusolver handle failed";
170  err = cusolverDnSetStream(solver_handle_, stream_);
171  CHECK_EQ(err, CUSOLVER_STATUS_SUCCESS) << "Setting cusolver stream failed";
172  this->solver_handle_ownership_ = OwnHandle;
173 #endif
174  }
175 // #if MSHADOW_USE_CUDNN && defined(__CUDACC__)
176 #if MSHADOW_USE_CUDNN == 1
177  inline static cudnnHandle_t GetDnnHandle(Stream<gpu> *stream) {
178  if (stream == NULL) {
179  return 0;
180  } else {
181  CHECK_NE(stream->dnn_handle_ownership_, NoHandle) << "No handle exist in source stream";
182  return stream->dnn_handle_;
183  }
184  }
185 #endif
186  inline void DestroyDnnHandle() {
187 // #if MSHADOW_USE_CUDNN && defined(__CUDACC__)
188 #if MSHADOW_USE_CUDNN == 1
189  if (dnn_handle_ownership_ == OwnHandle) {
190  cudnnStatus_t err = cudnnDestroy(dnn_handle_);
191  this->dnn_handle_ownership_ = NoHandle;
192  CHECK_EQ(err, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(err);
193  }
194 #endif
195  }
196  inline void CreateDnnHandle() {
197 // #if MSHADOW_USE_CUDNN == 1 && defined(__CUDACC__)
198 #if MSHADOW_USE_CUDNN == 1
199  this->DestroyDnnHandle();
200  cudnnStatus_t err = cudnnCreate(&dnn_handle_);
201  CHECK_EQ(err, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(err);
202  // At this point, we have the resource which may need to be freed
203  this->dnn_handle_ownership_ = OwnHandle;
204  err = cudnnSetStream(dnn_handle_, stream_);
205  CHECK_EQ(err, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(err);
206 #endif
207  }
208  inline void DestroyCuTensorHandle() {
209 #if MSHADOW_USE_CUTENSOR == 1
210  if (cutensor_handle_ownership_ == OwnHandle) {
211  // not destroy method available
212  if (cutensor_cachelines_ != nullptr) {
213  cutensorStatus_t err;
214  const char* cacheFilename = getenv("MXNET_CUTENSOR_CACHEFILE");
215  if (cacheFilename != nullptr) {
216  err = cutensorHandleWriteCacheToFile(&cutensor_handle_, cacheFilename);
217  CHECK_EQ(err, CUTENSOR_STATUS_SUCCESS) << cutensorGetErrorString(err);
218  }
219  err = cutensorHandleDetachPlanCachelines(&cutensor_handle_);
220  CHECK_EQ(err, CUTENSOR_STATUS_SUCCESS) << cutensorGetErrorString(err);
221  free(cutensor_cachelines_);
222  cutensor_cachelines_ = nullptr;
223  }
224  this->cutensor_handle_ownership_ = NoHandle;
225  }
226 #endif
227  }
228  inline void CreateCuTensorHandle() {
229 #if MSHADOW_USE_CUTENSOR == 1
230  this->DestroyCuTensorHandle();
231  cutensorStatus_t err = cutensorInit(&cutensor_handle_);
232  CHECK_EQ(err, CUTENSOR_STATUS_SUCCESS) << cutensorGetErrorString(err);
233  const char* cacheFilename = getenv("MXNET_CUTENSOR_CACHEFILE");
234  if (cacheFilename != nullptr) {
235  constexpr int32_t numCachelines = 1024;
236  size_t sizeCache = numCachelines * sizeof(cutensorPlanCacheline_t);
237  cutensor_cachelines_ = malloc(sizeCache);
238  err = cutensorHandleAttachPlanCachelines(&cutensor_handle_, (cutensorPlanCacheline_t*) cutensor_cachelines_, numCachelines);
239  CHECK_EQ(err, CUTENSOR_STATUS_SUCCESS) << cutensorGetErrorString(err);
240 
241  uint32_t numCachelinesRead = 0;
242  cutensorStatus_t status = cutensorHandleReadCacheFromFile(&cutensor_handle_, cacheFilename, &numCachelinesRead);
243  if (status == CUTENSOR_STATUS_IO_ERROR) {
244  printf("File (%s) doesn't seem to exist.\n", cacheFilename);
245  } else if (status == CUTENSOR_STATUS_INSUFFICIENT_WORKSPACE) {
246  printf("Cannot read cache: Please attach at least %d cachelines to the handle.\n", numCachelinesRead);
247  }
248  }
249  // At this point, we have the resource which may need to be freed
250  this->cutensor_handle_ownership_ = OwnHandle;
251 #endif
252  }
253 };
254 template<>
255 inline void DeleteStream<gpu>(Stream<gpu> *stream) {
256  if (stream) {
257  stream->DestroyCuTensorHandle();
258  MSHADOW_CUDA_CALL(cudaStreamDestroy(stream->stream_));
259  stream->DestroyBlasHandle();
260  stream->DestroySolverHandle();
261  stream->DestroyDnnHandle();
262  delete stream;
263  }
264 }
265 template<>
266 inline Stream<gpu> *NewStream<gpu>(bool create_blas_handle,
267  bool create_dnn_handle,
268  int dev_id) {
269  // RAII on Cuda exception
270  struct StreamDeleter { void operator()(Stream<gpu> *ptr) const { DeleteStream<gpu>(ptr); } };
271  std::unique_ptr<Stream<gpu>, StreamDeleter> st(new Stream<gpu>());
272  MSHADOW_CUDA_CALL(cudaStreamCreate(&st->stream_));
273  if (create_blas_handle) {
274  st->CreateBlasHandle();
275  st->CreateSolverHandle();
276  }
277  if (create_dnn_handle) {
278  st->CreateDnnHandle();
279  }
280 #if MSHADOW_USE_CUTENSOR == 1
281  st->CreateCuTensorHandle();
282 #endif
283  st->dev_id = dev_id;
284  if (dev_id != -1) {
285  MSHADOW_CUDA_CALL(cudaGetDeviceProperties(&st->prop, dev_id));
286  }
287  return st.release();
288 }
289 #endif
290 } // namespace mshadow
291 #endif // MSHADOW_STREAM_GPU_INL_H_
mshadow::Stream< gpu >::solver_handle_ownership_
HandleState solver_handle_ownership_
cusolver handle ownership
Definition: stream_gpu-inl.h:62
mshadow::Stream
computation stream structure, used for asynchronous computations
Definition: tensor.h:488
mshadow::Stream< gpu >::DestroyBlasHandle
void DestroyBlasHandle()
Destroy cublas handle if we own it.
Definition: stream_gpu-inl.h:131
mshadow::DeleteStream< gpu >
void DeleteStream< gpu >(Stream< gpu > *stream)
Definition: stream_gpu-inl.h:255
mshadow::Stream< gpu >::DestroySolverHandle
void DestroySolverHandle()
Definition: stream_gpu-inl.h:157
mshadow::Stream< gpu >::DestroyDnnHandle
void DestroyDnnHandle()
Definition: stream_gpu-inl.h:186
MSHADOW_USE_CUDNN
#define MSHADOW_USE_CUDNN
use CUDNN support, must ensure that the cudnn include path is correct
Definition: base.h:122
mshadow::Stream< gpu >::CreateSolverHandle
void CreateSolverHandle()
Definition: stream_gpu-inl.h:165
mshadow::Stream< gpu >::stream_
cudaStream_t stream_
cudaStream
Definition: stream_gpu-inl.h:44
MSHADOW_CUDA_CALL
#define MSHADOW_CUDA_CALL(func)
Protected cuda call in mshadow.
Definition: base.h:264
mshadow::Stream< gpu >::CreateCuTensorHandle
void CreateCuTensorHandle()
Definition: stream_gpu-inl.h:228
mshadow::Stream< gpu >::HandleState
HandleState
handle state
Definition: stream_gpu-inl.h:39
mshadow::Stream< gpu >::blas_handle_
cublasHandle_t blas_handle_
cublas handle
Definition: stream_gpu-inl.h:46
mshadow::Stream< gpu >::dev_id
int dev_id
dev id
Definition: stream_gpu-inl.h:71
mshadow::Stream< gpu >::CreateBlasHandle
void CreateBlasHandle()
Destroy original blas handle and create a new one.
Definition: stream_gpu-inl.h:139
mshadow::gpu
device name GPU
Definition: tensor.h:46
mshadow::Stream< gpu >::cutensor_handle_ownership_
HandleState cutensor_handle_ownership_
cutensor handle ownership
Definition: stream_gpu-inl.h:66
tensor.h
header file of tensor data structure and functions This lib requires explicit memory allocation and d...
mshadow::Stream< gpu >::GetBlasHandle
static cublasHandle_t GetBlasHandle(Stream< gpu > *stream)
return actual cublasHandle
Definition: stream_gpu-inl.h:121
mshadow::Stream< gpu >
Definition: stream_gpu-inl.h:37
mshadow::Stream< gpu >::Stream
Stream(void)
Definition: stream_gpu-inl.h:73
mshadow::NewStream< gpu >
Stream< gpu > * NewStream< gpu >(bool create_blas_handle, bool create_dnn_handle, int dev_id)
Definition: stream_gpu-inl.h:266
mshadow::Stream< gpu >::DestroyCuTensorHandle
void DestroyCuTensorHandle()
Definition: stream_gpu-inl.h:208
mshadow::Stream< gpu >::GetStream
static cudaStream_t GetStream(Stream< gpu > *stream)
returns actual cudaStream_t given an input GPU stream pointer
Definition: stream_gpu-inl.h:107
mshadow::Stream< gpu >::blas_handle_ownership_
HandleState blas_handle_ownership_
cublas handle ownership
Definition: stream_gpu-inl.h:60
mshadow::Stream< gpu >::GetSolverHandle
static cusolverDnHandle_t GetSolverHandle(Stream< gpu > *stream)
Definition: stream_gpu-inl.h:148
mshadow::Stream< gpu >::prop
cudaDeviceProp prop
cudaDeviceProp
Definition: stream_gpu-inl.h:69
mshadow::Stream< gpu >::Wait
void Wait(void)
wait for all the computation associated with this stream to complete
Definition: stream_gpu-inl.h:89
mshadow
overloaded + operator between half_t and bf16_t
Definition: base.h:319
mshadow::Stream< gpu >::CreateDnnHandle
void CreateDnnHandle()
Definition: stream_gpu-inl.h:196
mshadow::Stream< gpu >::dnn_handle_ownership_
HandleState dnn_handle_ownership_
cudnn handle ownership
Definition: stream_gpu-inl.h:64
mshadow::Stream< gpu >::solver_handle_
cusolverDnHandle_t solver_handle_
cusolver handle
Definition: stream_gpu-inl.h:49
base.h
definitions of base types, operators, macros functions
mshadow::Stream< gpu >::CheckIdle
bool CheckIdle(void)
query whether the stream is idle
Definition: stream_gpu-inl.h:96