// cudamatrix/cu-matrix.cc // Copyright 2009-2012 Karel Vesely, Lucas Ondel // 2013 Ehsan Variani // 2013 Johns Hopkins University (author: Daniel Povey) // 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013 Johns Hopkins University (author: Guoguo Chen) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #if HAVE_CUDA == 1 #include #include #endif #include "util/timer.h" #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-vector.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" #include "cudamatrix/cu-randkernels.h" #include "cudamatrix/cu-choleskykernels.h" #include "cudamatrix/cu-array.h" #include "cudamatrix/cu-math.h" #include "cudamatrix/cu-sp-matrix.h" #include "cudamatrix/cu-tp-matrix.h" #include "cudamatrix/cu-block-matrix.h" #include "cudamatrix/cublas-wrappers.h" namespace kaldi { template void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, MatrixResizeType resize_type) { // This code does not currently support the other resize_type options. KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined); if (rows * cols == 0) KALDI_ASSERT(rows == 0 && cols == 0); if (this->num_rows_ == rows && this->num_cols_ == cols) { if (resize_type == kSetZero) this->SetZero(); return; } if (this->num_rows_ != 0) this->Destroy(); if (rows == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; MatrixIndexT row_bytes = cols * sizeof(Real); size_t pitch; this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( row_bytes, rows, &pitch)); this->num_rows_ = rows; this->num_cols_ = cols; this->stride_ = pitch / sizeof(Real); if (resize_type == kSetZero) this->SetZero(); CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim.Elapsed()); } else #endif { // Let the initializer of Matrix handle the allocation, // and then just do Swap which will switch the pointers. // This wastes a few instructions but is simple to code. Matrix mat(rows, cols, resize_type); this->Swap(&mat); } } template void CuMatrix::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->data_ != NULL) { Timer tim; CuDevice::Instantiate().Free(this->data_); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } } else #endif { if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_); } this->data_ = NULL; this->num_rows_ = 0; this->num_cols_ = 0; this->stride_ = 0; } template void CuMatrix::Swap(CuMatrix *mat) { std::swap(mat->data_, this->data_); std::swap(mat->num_cols_, this->num_cols_); std::swap(mat->num_rows_, this->num_rows_); std::swap(mat->stride_, this->stride_); } template void CuMatrix::Swap(Matrix *mat) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->num_rows_ == 0) { if (mat->num_rows_ != 0) { // *this is empty, but mat is nonempty. this->Resize(mat->num_rows_, mat->num_cols_, kUndefined); this->CopyFromMat(*mat); mat->Resize(0, 0); } // else both are empty. } else { // *this is nonempty. if (mat->num_rows_ != 0) { // Both *this and *mat are nonempty. Recurse to simpler cases. // this could be done more efficiently in the case where // the size does not change. Matrix temp; this->Swap(&temp); // now temp is full, *this is empty. mat->Swap(&temp); // now mat has data from *this, temp has // data from mat. this->Swap(&temp); // copy data in mat to *this, which is now empty. } else { // *this is full but *mat is empty. mat->Resize(this->num_rows_, this->num_cols_, kUndefined); this->CopyToMat(mat); this->Destroy(); } } } else #endif { std::swap(mat->data_, this->data_); std::swap(mat->num_cols_, this->num_cols_); std::swap(mat->num_rows_, this->num_rows_); std::swap(mat->stride_, this->stride_); } } template void CuMatrixBase::CopyFromBlock(const CuBlockMatrix &B, MatrixTransposeType trans) { this->SetZero(); if (trans == kNoTrans) { KALDI_ASSERT(NumRows() == B.NumRows() && NumCols() == B.NumCols()); int32 row_offset = 0, col_offset = 0; for (int32 b = 0; b < B.NumBlocks(); b++) { const CuMatrixBase &block = B.Block(b); int32 num_rows = block.NumRows(), num_cols = block.NumCols(); CuSubMatrix this_block(*this, row_offset, num_rows, col_offset, num_cols); this_block.CopyFromMat(block); row_offset += num_rows; col_offset += num_cols; } KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols()); } else { KALDI_ASSERT(NumRows() == B.NumCols() && NumCols() == B.NumRows()); int32 row_offset = 0, col_offset = 0; for (int32 b = 0; b < B.NumBlocks(); b++) { const CuMatrixBase &block = B.Block(b); int32 num_rows = block.NumCols(), num_cols = block.NumRows(); CuSubMatrix this_block(*this, row_offset, num_rows, col_offset, num_cols); this_block.CopyFromMat(block, kTrans); row_offset += num_rows; col_offset += num_cols; } KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols()); } } template CuMatrix::CuMatrix(const CuBlockMatrix &B, MatrixTransposeType trans): CuMatrixBase() { if (trans == kNoTrans) { Resize(B.NumRows(), B.NumCols(), kUndefined); this->CopyFromBlock(B); } else { Resize(B.NumCols(), B.NumRows(), kUndefined); this->CopyFromBlock(B, kTrans); } } template template void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixTransposeType Trans) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (Trans == kNoTrans) { KALDI_ASSERT(M.NumRows() == num_rows_ && M.NumCols() == num_cols_); } else { KALDI_ASSERT(M.NumCols() == num_rows_ && M.NumRows() == num_cols_); } if (M.num_rows_ == 0) return; // Nothing to do. Timer tim; if (sizeof(Real) == sizeof(OtherReal) && Trans == kNoTrans ) { MatrixIndexT dst_pitch = stride_ * sizeof(Real); MatrixIndexT src_pitch = M.Stride() * sizeof(Real); MatrixIndexT width = M.NumCols() * sizeof(Real); CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, M.data_, src_pitch, width, M.num_rows_, cudaMemcpyDeviceToDevice)); } else { dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // We are making this kernel "newer-style, with x corresponding to // row dimension and y to column dimension. dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK)); if (Trans == kNoTrans) { cuda_copy_from_mat(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim()); } else { cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim()); } } CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from other CuMatrixBase)", tim.Elapsed()); } else #endif { Mat().CopyFromMat(M.Mat(), Trans); } } // Instantiate the template above. template void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixTransposeType Trans); template template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, MatrixTransposeType Trans) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; int dimGrid = 1; int dimBlock = num_rows_; SetZero(); if (Trans == kNoTrans) { cuda_copy_from_tp(dimGrid, dimBlock, data_, M.Data(), Dim()); } else { cuda_copy_from_tp_trans(dimGrid, dimBlock, data_, M.Data(), Dim()); } CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().CopyFromTp(M.Mat(), Trans); } } // instantiate the template above. template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, MatrixTransposeType Trans); /* // template instantiations. template void CuMatrixBase::CopyFromMat(const CuMatrixBase & M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromMat(const CuMatrixBase & M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromMat(const CuMatrixBase & M, MatrixTransposeType Trans); template void CuMatrixBase::CopyFromMat(const CuMatrixBase & M, MatrixTransposeType Trans); */ template void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixTransposeType trans) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (trans == kNoTrans) { KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_); Timer tim; MatrixIndexT dst_pitch = stride_*sizeof(Real); MatrixIndexT src_pitch = src.Stride()*sizeof(Real); MatrixIndexT width = src.NumCols()*sizeof(Real); CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyHostToDevice)); CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)",tim.Elapsed()); } else { CuMatrix trans_mat(src); // Do the transpose on the GPU board. this->CopyFromMat(trans_mat, kTrans); } } else #endif { Mat().CopyFromMat(src, trans); } } template template void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixTransposeType trans) { CuMatrix temp(src); this->CopyFromMat(temp, trans); } template void CuMatrixBase::CopyFromSp(const CuSpMatrix &M) { KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; int dimBlock(CU2DBLOCK); int dimGrid(n_blocks(NumRows(),CU2DBLOCK)); cuda_copy_from_sp(dimGrid, dimBlock, M.Data(), data_, num_rows_, Dim()); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromSp",tim.Elapsed()); } else #endif { Mat().CopyFromSp(M.Mat()); } } template CuMatrix::CuMatrix(const CuMatrix &other, MatrixTransposeType trans) { if (trans == kNoTrans) this->Resize(other.NumRows(), other.NumCols(), kUndefined); else this->Resize(other.NumCols(), other.NumRows(), kUndefined); this->CopyFromMat(other, trans); } template CuMatrix::CuMatrix(const CuMatrixBase &other, MatrixTransposeType trans) { if (trans == kNoTrans) this->Resize(other.NumRows(), other.NumCols(), kUndefined); else this->Resize(other.NumCols(), other.NumRows(), kUndefined); this->CopyFromMat(other, trans); } template template CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans) { if (trans == kNoTrans) this->Resize(other.NumRows(), other.NumCols(), kUndefined); else this->Resize(other.NumCols(), other.NumRows(), kUndefined); this->CopyFromMat(other, trans); } // Instantiate the template above. template CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); template CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); template CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); template CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); template template void CuMatrixBase::CopyToMat(MatrixBase *dst, MatrixTransposeType trans) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (trans == kTrans || sizeof(OtherReal) != sizeof(Real)) { CuMatrix this_trans(*this, trans); this_trans.CopyToMat(dst, kNoTrans); } else { KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols()); Timer tim; MatrixIndexT src_pitch = stride_*sizeof(Real); MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real); MatrixIndexT width = NumCols()*sizeof(Real); CU_SAFE_CALL(cudaMemcpy2D(dst->Data(), dst_pitch, this->data_, src_pitch, width, this->num_rows_, cudaMemcpyDeviceToHost)); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToMatD2H",tim.Elapsed()); } } else #endif { dst->CopyFromMat(Mat(), trans); } } // instantiate the template above. template void CuMatrixBase::CopyToMat(MatrixBase *dst, MatrixTransposeType trans) const; template void CuMatrixBase::CopyToMat(MatrixBase *dst, MatrixTransposeType trans) const; template void CuMatrixBase::CopyToMat(MatrixBase *dst, MatrixTransposeType trans) const; template void CuMatrixBase::CopyToMat(MatrixBase *dst, MatrixTransposeType trans) const; template void CuMatrix::Read(std::istream &is, bool binary) { Matrix temp; temp.Read(is, binary); Destroy(); Swap(&temp); } template void CuMatrixBase::Write(std::ostream &os, bool binary) const { Matrix temp(this->num_rows_, this->num_cols_, kUndefined); this->CopyToMat(&temp); temp.Write(os, binary); } template void CuMatrixBase::SetZero() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; CU_SAFE_CALL(cudaMemset2D(data_, stride_ * sizeof(Real), 0, num_cols_ * sizeof(Real), num_rows_ )); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim.Elapsed()); } else #endif { Mat().SetZero(); } } /* * Methods wrapping the ANSI-C CUDA kernels */ template void CuMatrixBase::Set(Real value) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_set_const(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().Set(value); } } // set zero the upper diagonal // no cpu implementation yet. Check with Dan. template void CuMatrixBase::SetZeroUpperDiag() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_set_zero_above_diag(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } #endif } template void CuMatrixBase::Add(Real value) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_add(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().Add(value); } } template void CuMatrixBase::AddToDiag(Real value) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; // We'll create a fake matrix with "num_diag" rows, one // columnn, and a stride of "this_stride". The y-value of // the grid/blocks corresponds to the row, in this kernel. MatrixIndexT num_diag = std::min(num_rows_, num_cols_), this_stride = stride_ + 1; dim3 dimBlock(1, CU1DBLOCK); dim3 dimGrid(1, n_blocks(num_diag, CU1DBLOCK)); ::MatrixDim d = { num_diag, 1, this_stride }; cuda_add(dimGrid, dimBlock, data_, value, d); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().AddToDiag(value); } } template bool CuMatrixBase::IsUnit(Real tol) const { // want to return: //FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.: //sqrt (trace((*this - I)(*this-I)) <= tol * NumRows() // trace((*this - I)(*this - I)) <= tol * NumRows() // trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows() // trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows() KALDI_ASSERT(this->NumRows() == this->NumCols()); return (TraceMatMat(*this, *this, kTrans) + this->NumRows() - 2.0 * this->Trace() <= tol * this->NumRows()); } template void CuMatrixBase::Scale(Real value) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_scale(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().Scale(value); } } template void CuMatrixBase::ApplyLog() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_apply_log(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().ApplyLog(); } } template void CuMatrixBase::MulElements(const CuMatrixBase& A) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(num_cols_ == A.NumCols()); KALDI_ASSERT(num_rows_ == A.NumRows()); dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_mul_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().MulElements(A.Mat()); } } template void CuMatrixBase::Max(const CuMatrixBase& A) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(num_cols_ == A.NumCols()); KALDI_ASSERT(num_rows_ == A.NumRows()); dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_max(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().Max(A.Mat()); } } template void CuMatrixBase::MulColsVec(const CuVectorBase &scale) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(scale.Dim() == NumCols()); dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_mul_cols_vec(dimGrid, dimBlock, data_, scale.data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().MulColsVec(scale.Vec()); } } template void CuMatrixBase::MulRowsVec(const CuVectorBase &scale) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(scale.Dim() == NumRows()); dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_mul_rows_vec(dimGrid, dimBlock, data_, scale.data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().MulRowsVec(scale.Vec()); } } template void CuMatrixBase::MulRowsGroupMat(const CuMatrixBase &src) { KALDI_ASSERT(src.NumCols() > 0); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; int group_size = this->NumCols() / src.NumCols(); dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_mul_rows_group_mat(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().MulRowsGroupMat(src.Mat()); } } template void CuMatrixBase::GroupPnormDeriv(const CuMatrixBase &src1, const CuMatrixBase &src2, Real power) { KALDI_ASSERT(src2.NumCols() > 0); int group_size = this->NumCols() / src2.NumCols(); KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_calc_pnorm_deriv(dimGrid, dimBlock, this->data_, src1.Data(), src2.Data(), Dim(), src2.Stride(), group_size, power); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().GroupPnormDeriv(src1.Mat(), src2.Mat(), power); } } template void CuMatrixBase::DivRowsVec(const CuVectorBase &div) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(div.Dim() == NumRows()); dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_div_rows_vec(dimGrid, dimBlock, data_, div.data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Vector temp(div.Vec()); // will copy. temp.InvertElements(); Mat().MulRowsVec(temp); } } template void CuMatrixBase::InvertElements() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_invert_elements(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().InvertElements(); } } template void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(num_rows_ == A.num_rows_ && num_cols_ == A.num_cols_); if (num_rows_ == 0) return; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_add_mat(dimGrid, dimBlock, alpha, A.data_, beta, data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().Scale(beta); Mat().AddMat(alpha, A.Mat()); } } template void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, const CuMatrixBase &B, const CuMatrixBase &C) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(num_rows_ == A.num_rows_ && num_cols_ == A.num_cols_); KALDI_ASSERT(num_rows_ == B.num_rows_ && num_cols_ == B.num_cols_); KALDI_ASSERT(num_rows_ == C.num_rows_ && num_cols_ == C.num_cols_); if (num_rows_ == 0) return; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_add_mat_mat_div_mat(dimGrid, dimBlock, A.data_, B.data_, C.data_, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().AddMatMatDivMat(A.Mat(), B.Mat(), C.Mat()); } } template void CuMatrixBase::AddVecToCols(Real alpha, const CuVectorBase &col, Real beta) { if (col.Dim() != NumRows()) { KALDI_ERR << "Non matching dimensions: Rows:" << NumRows() << " VectorDim:" << col.Dim(); } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.data_, beta, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { if (beta != 1.0) Mat().Scale(beta); Mat().AddVecToCols(alpha, col.Vec()); } } template void CuMatrixBase::AddVecToRows(Real alpha, const CuVectorBase &row, Real beta) { if (row.Dim() != NumCols()) { KALDI_ERR << "Non matching dimensions: Cols:" << NumCols() << " VectorDim:" << row.Dim(); } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_add_vec_to_rows(dimGrid, dimBlock, alpha, row.data_, beta, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { if (beta != 1.0) Mat().Scale(beta); Mat().AddVecToRows(alpha, row.Vec()); } } /* * Method wrapping the CUBLAS function GEMM */ template void CuMatrixBase::AddMatMat( Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, const CuMatrixBase &B, MatrixTransposeType transB, Real beta) { // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping? // keep trans..., just swap A&B matrices: A->B B->A MatrixIndexT m = ((transB==kTrans)? B.NumRows() : B.NumCols()); MatrixIndexT n = ((transA==kTrans)? A.NumCols() : A.NumRows()); MatrixIndexT k = ((transB==kTrans)? B.NumCols() : B.NumRows()); MatrixIndexT k1 = ((transA==kTrans)? A.NumRows() : A.NumCols()); KALDI_ASSERT(m == NumCols()); KALDI_ASSERT(n == NumRows()); KALDI_ASSERT(k == k1); if (m == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; cublas_gemm((transB==kTrans?'T':'N'), (transA==kTrans?'T':'N'), m, n, k, alpha, B.data_, B.Stride(), A.data_, A.Stride(), beta, data_, Stride()); CU_SAFE_CALL(cublasGetError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().AddMatMat(alpha, A.Mat(), transA, B.Mat(), transB, beta); } } template void CuMatrixBase::SymAddMat2( Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, Real beta) { KALDI_ASSERT(num_rows_ == num_cols_ && ((transA == kNoTrans && A.num_rows_ == num_rows_) || (transA == kTrans && A.num_cols_ == num_cols_))); if (num_rows_ == 0) return; KALDI_ASSERT(A.data_ != data_); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; char trans = (transA == kTrans ? 'N' : 'T'); MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_); cublas_syrk('U', trans, num_rows_, A_other_dim, alpha, A.Data(), A.Stride(), beta, this->data_, this->stride_); CU_SAFE_CALL(cublasGetError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().SymAddMat2(alpha, A.Mat(), transA, beta); } } template void CuMatrixBase::AddDiagVecMat( const Real alpha, CuVectorBase &v, const CuMatrixBase &M, MatrixTransposeType transM, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (transM == kNoTrans) { KALDI_ASSERT(SameDim(*this, M)); } else { KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows()); } KALDI_ASSERT(v.Dim() == this->NumRows()); Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // Caution, this dimGrid is not the same way around as much of the other // code: going forward, I want to use the (rows, cols) order. dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK)); MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; if (transM == kTrans) std::swap(M_row_stride, M_col_stride); cuda_add_diag_vec_mat(dimGrid, dimBlock, alpha, data_, Dim(), v.Data(), M.Data(), M_row_stride, M_col_stride, beta); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().AddDiagVecMat(alpha, v.Vec(), M.Mat(), transM, beta); } } template void CuMatrixBase::Sigmoid(const CuMatrixBase &src) { //KALDI_ASSERT(SameDimAndStride(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); cuda_sigmoid(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().Sigmoid(src.Mat()); } } template void CuMatrixBase::SoftHinge(const CuMatrixBase &src) { //KALDI_ASSERT(SameDimAndStride(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); cuda_soft_hinge(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().SoftHinge(src.Mat()); } } template void CuMatrixBase::GroupPnorm(const CuMatrixBase &src, Real power) { int group_size = src.NumCols() / this->NumCols(); KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size && this->NumRows() == src.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); cuda_group_pnorm(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size, power); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().GroupPnorm(src.Mat(), power); } } /* Think of sv_labels as a Matrix, denoting the "correct" label of each frame to each phone-state; it's very likely to contain a LOT of zeros tot_weight = the sum of ALL element in matrix sv_labels tot_objf = the sum of the product of (each element in matrix sv_labels) and (the log of its counterpart in matrix output) an element in "this" matrix = (the element in matrix sv_labels) divided by (the element in matrix output) */ template void CuMatrix::CompObjfAndDeriv(const std::vector >& sv_labels, const CuMatrix &output, Real *tot_objf, Real* tot_weight) { { // check the input. typedef typename std::vector >::const_iterator Iter; MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; for (Iter iter = sv_labels.begin(); iter != sv_labels.end(); ++iter) { KALDI_ASSERT(iter->row < num_rows && iter->row >= 0 && iter->column < num_cols && iter->column >= 0); } } # if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (sv_labels.empty()) { KALDI_WARN << "Empty supervision labels"; *tot_objf = 0.0; *tot_weight = 0.0; return; } void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() * sizeof(MatrixElement)); CU_SAFE_CALL(cudaMemcpy(addr, sv_labels.data(), sv_labels.size() * sizeof(MatrixElement), cudaMemcpyHostToDevice)); Timer tim; CuVector tmp(2, kUndefined); //tmp(0) = 0; tmp(1) = 0; int dimBlock(CU1DBLOCK); int dimGrid = 1;// only 1 block here. we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); cuda_comp_obj_deriv(dimGrid, dimBlock, (MatrixElement*)addr, sv_labels.size(), output.Data(), output.Dim(), this->Data(), this->Dim(), tmp.Data()); Vector tmp_cpu(tmp); *tot_objf = tmp_cpu(0); *tot_weight = tmp_cpu(1); CuDevice::Instantiate().Free(addr); CuDevice::Instantiate().AccuProfile("Comp_Obj_Deriv", tim.Elapsed()); } else #endif { for(int32 i = 0; i= 0 && label < nnet_.OutputDim()); Real this_prob = output(m, label); KALDI_ASSERT(this_prob >= 0.99e-20); // we floored to 1.0e-20 in SoftmaxLayer. *tot_objf += weight * log(this_prob); *tot_weight += weight; (*this)(m, label) += weight / this_prob; } } } template // Y->this, X->src void CuMatrixBase::ApplySoftMaxPerRow(const CuMatrixBase &src) { //KALDI_ASSERT(SameDimAndStride(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; size_t dimBlock = src.num_cols_ > CU1DBLOCK ? CU1DBLOCK : src.num_cols_; size_t dimGrid = src.num_rows_; cuda_softmax_reduce(dimGrid, dimBlock, data_, src.data_, Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { MatrixBase &mat(this->Mat()); mat.CopyFromMat(src.Mat()); for(MatrixIndexT r = 0; r < mat.NumRows(); r++) { mat.Row(r).ApplySoftMax(); } } } // DiffSigmoid(Ein, Y, Eout) -> Eout.DiffSigmoid(Y, Ein). template // Eout -> *this, Ein -> diff, Y -> value void CuMatrixBase::DiffSigmoid(const CuMatrixBase &value, const CuMatrixBase &diff) { //KALDI_ASSERT(SameDimAndStride(*this, value) && SameDimAndStride(*this, diff)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); cuda_diff_sigmoid(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().DiffSigmoid(value.Mat(), diff.Mat()); } } template void CuMatrixBase::Tanh(const CuMatrixBase &src) { //KALDI_ASSERT(SameDimAndStride(*this, src)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); cuda_tanh(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().Tanh(src.Mat()); } } template // Ein -> diff, Y -> value void CuMatrixBase::DiffTanh(const CuMatrixBase &value, const CuMatrixBase &diff) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); cuda_diff_tanh(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().DiffTanh(value.Mat(), diff.Mat()); } } template void CuMatrixBase::FindRowMaxId(CuArray *id) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; // initialize the vectors CuVector max(num_rows_); max.Set(-1e21); id->Resize(num_rows_); id->Set(-1); MatrixDim d=Dim();// only stride will be used! // process per 256 column blocks for(int32 block=0; (block+1)*256 <= num_cols_; block++) { dim3 dimBlock(256, 1); dim3 dimGrid(1, num_rows_); int32 offset=block*256; cuda_find_row_max_id(dimGrid, dimBlock, data_ + offset, max.data_, id->Data(), offset, d); } // process the remainder int32 div = num_cols_ / 256; int32 mod = num_cols_ % 256; if (mod != 0) { dim3 dimBlock(mod, 1); dim3 dimGrid(1, num_rows_); int32 offset=div*256; cuda_find_row_max_id(dimGrid, dimBlock, data_ + offset, max.data_, id->Data(), offset, d); } // now we have the indices! CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { // allocate index buffer id->Resize(num_rows_); id->Set(-1); // find maxima MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; for(MatrixIndexT r = 0; r < num_rows; r++) { Real max = -1e21; int32 max_id = -1; const Real *row_data = Mat().RowData(r); for(MatrixIndexT c = 0; c < num_cols; c++) { if (max < row_data[c]) { max = row_data[c]; max_id = c; } } id->Data()[r] = max_id; } } } template void CuMatrixBase::DiffXent(const CuArray &tgt, CuVector *log_post_tgt) { KALDI_ASSERT(tgt.Dim() == num_rows_); log_post_tgt->Resize(tgt.Dim()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(1, CU2DBLOCK*8); dim3 dimGrid(1, n_blocks(tgt.Dim(), CU2DBLOCK*8)); cuda_diff_xent(dimGrid, dimBlock, tgt.Data(), data_, log_post_tgt->data_, Dim()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { MatrixIndexT num_rows = num_rows_; for(int32 r = 0; r < num_rows; r++) { int32 col_tgt = tgt.Data()[r]; Real &value = Mat()(r, col_tgt); log_post_tgt->Vec()(r) = log(value); value -= 1.0; } } } /// This method may be only called for symmetric matrices. template void CuMatrixBase::Cholesky() { KALDI_ASSERT(this->NumRows() == this->NumCols()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; int TILE_SIZE = 16; int n_blocks = (num_rows_ + TILE_SIZE - 1) / TILE_SIZE; dim3 threads(TILE_SIZE,TILE_SIZE); dim3 logrid; for (int i = n_blocks; i > 2; i--) { cuda_factorize_diagonal_block(data_, n_blocks-i, Dim()); cuda_strip_update(data_, n_blocks-i, i, Dim()); cuda_diag_update(data_, n_blocks-i, i, Dim()); cuda_lo_update(data_, n_blocks-i, n_blocks, i, Dim()); } if (n_blocks > 1) { cuda_factorize_diagonal_block(data_, n_blocks-2, Dim()); cuda_strip_update(data_, n_blocks-2, 2, Dim()); cuda_diag_update(data_, n_blocks-2, 2, Dim()); } cuda_factorize_diagonal_block(data_, n_blocks-1, Dim()); CU_SAFE_CALL(cudaGetLastError()); // set the upper diagonal equal to zero this->SetZeroUpperDiag(); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { SpMatrix sp(this->NumRows(), kUndefined); sp.CopyFromMat(this->Mat(), kTakeLower); TpMatrix tp(this->NumRows()); tp.Cholesky(sp); this->Mat().CopyFromTp(tp); } } template void CuMatrixBase::SymInvertPosDef() { KALDI_ASSERT(num_rows_ == num_cols_); if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(NumRows(),CU1DBLOCK)); CuMatrix temp(num_rows_, num_rows_); Real value = 1.0; cuda_set_diag(dimGrid, dimBlock, temp.Data(), value, temp.Dim()); this->Cholesky(); { Timer tim; Real alpha = 1.0; cublas_trsm(num_rows_, num_rows_, alpha, data_, stride_, temp.Data(), temp.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuMatrixBase::InvertPSD(trsm)", tim.Elapsed()); } this->AddMatMat(1, temp, kTrans, temp, kNoTrans, 0); this->CopyLowerToUpper(); } else #endif { SpMatrix temp_sp(this->Mat(), kTakeLower); TpMatrix C(temp_sp.NumRows(), kUndefined); C.Cholesky(temp_sp); C.Invert(); temp_sp.AddTp2(1.0, C, kTrans, 0.0); this->Mat().CopyFromSp(temp_sp); // was previously just: CuSpMatrix::Invert(). } } template bool CuMatrixBase::ApproxEqual(const CuMatrixBase &other, float tol) const { CuMatrix diff(*this); diff.AddMat(-1.0, other); return (diff.FrobeniusNorm() <= tol * (*this).FrobeniusNorm()); } template Real TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, MatrixTransposeType trans) { if (A.num_rows_ == 0) { KALDI_ASSERT(B.num_rows_ == 0); return 0.0; } Real result = 0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; // the sizes of result_vec must match what we // call the kernels with, in cu-kernels.cu CuVector result_vec(trans == kTrans ? 4 : 2, kUndefined); if (trans == kNoTrans) { KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows()); cuda_trace_mat_mat(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data()); } else { KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols()); cuda_trace_mat_mat_trans(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data()); } CU_SAFE_CALL(cudaGetLastError()); Vector result_cpu(result_vec); // copying from CUDA faster than summing in CUDA. result = result_cpu.Sum(); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { result = TraceMatMat(A.Mat(), B.Mat(), trans); } return result; } template float TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, MatrixTransposeType trans); template double TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, MatrixTransposeType trans); template void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; if (v.Dim() == num_rows_*num_cols_) { if (stride_ == num_cols_) { const Real* v_data = v.Data(); CU_SAFE_CALL(cudaMemcpy(data_, v_data, sizeof(Real)*num_rows_*num_cols_, cudaMemcpyDeviceToDevice)); } else { CU_SAFE_CALL(cudaMemcpy2D(data_, stride_ * sizeof(Real), v.Data(), num_cols_*sizeof(Real), num_cols_*sizeof(Real), num_rows_, cudaMemcpyDeviceToDevice)); } } else if (v.Dim() == num_cols_) { dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // this is a newer kernel where (x,y) dims represent (rows,cols). dim3 dimGrid(n_blocks(NumRows(),CU2DBLOCK), n_blocks(NumCols(),CU2DBLOCK)); cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data()); } else { KALDI_ERR << "Wrong sized arguments"; } CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().CopyRowsFromVec(v.Vec()); } } template void CuMatrixBase::CopyRowsFromVec(const VectorBase &v) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; if (v.Dim() == num_rows_*num_cols_) { if (stride_ == num_cols_) { const Real* v_data = v.Data(); cudaMemcpy(data_, v_data, sizeof(Real)*num_rows_*num_cols_, cudaMemcpyHostToDevice); } else { const Real *v_data = v.Data(); for (MatrixIndexT r = 0; r < num_rows_; r++) { Real *row_data = RowData(r); cudaMemcpy(row_data, v_data, sizeof(Real)*num_cols_, cudaMemcpyHostToDevice); v_data += num_cols_; } } } else if (v.Dim() == num_cols_) { dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // This is a newer kernel where x corresponds to NumRows() and y to NumCols(). dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); cuda_copy_rows_from_vec(dimGrid, dimBlock, this->data_, this->Dim(), v.Data()); CU_SAFE_CALL(cudaGetLastError()); /* const Real *v_data = v.Data(); for (MatrixIndexT r = 0; r < num_rows_; r++) cudaMemcpy(RowData(r), v_data, sizeof(Real)*num_cols_, cudaMemcpyHostToDevice); */ } else { KALDI_ERR << "Wrong sized arguments"; } CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().CopyRowsFromVec(v); } } template void CuMatrixBase::CopyColFromVec(const CuVectorBase &v, const MatrixIndexT col) { KALDI_ASSERT(v.Dim() == num_rows_ && static_cast(col) < static_cast(num_cols_)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(NumRows(), CU1DBLOCK)); cuda_copy_col_from_vec(dimGrid, dimBlock, data_, v.Data(), col, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().CopyColFromVec(v.Vec(), col); } } template void CuMatrixBase::ApplyPow(Real power) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); cuda_apply_pow(dimGrid, dimBlock, data_, power, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().ApplyPow(power); } } template void CuMatrixBase::ApplyHeaviside() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); cuda_apply_heaviside(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().ApplyHeaviside(); } } template void CuMatrixBase::ApplyExp() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_apply_exp(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().ApplyExp(); } } template void CuMatrixBase::ApplyFloor(Real floor_val) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_apply_floor(dimGrid, dimBlock, data_, floor_val, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().ApplyFloor(floor_val); } } template void CuMatrixBase::ApplyCeiling(Real ceiling_val) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().ApplyCeiling(ceiling_val); } } template void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; if (mat.Stride() == mat.NumCols()) { cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, cudaMemcpyDeviceToHost); } else { Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { cudaMemcpy(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToHost); vec_data += mat.NumCols(); } } CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyRowsFromMat", tim.Elapsed()); } else #endif { CopyRowsFromMat(mat.Mat()); } } // Instantiate the template above. template void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat); template void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat); template void CuMatrixBase::CopyCols(const CuMatrixBase &src, const std::vector &reorder) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(reorder.size()) == NumCols()); KALDI_ASSERT(NumRows() == src.NumRows()); #ifdef KALDI_PARANOID MatrixIndexT src_cols = src.NumCols(); for (size_t i = 0; i < reorder.size(); i++) KALDI_ASSERT(reorder[i] >= -1 && reorder[i] < src_cols); #endif CuArray cuda_reorder(reorder); Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // This kernel, as it is newer has the (x,y) dims as (rows,cols). dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); cuda_copy_cols(dimGrid, dimBlock, data_, src.Data(), cuda_reorder.Data(), Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().CopyCols(src.Mat(), reorder); } } template void CuMatrixBase::CopyCols(const CuMatrixBase &src, const CuArray &reorder) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(reorder.Dim() == NumCols()); KALDI_ASSERT(NumRows() == src.NumRows()); Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // This kernel, as it is newer has the (x,y) dims as (rows,cols). dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); cuda_copy_cols(dimGrid, dimBlock, data_, src.Data(), reorder.Data(), Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { std::vector reorder_cpu; reorder.CopyToVec(&reorder_cpu); Mat().CopyCols(src.Mat(), reorder_cpu); } } template void CuMatrixBase::CopyRows(const CuMatrixBase &src, const std::vector &reorder) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(reorder.size()) == NumRows()); KALDI_ASSERT(NumCols() == src.NumCols()); #ifdef KALDI_PARANOID MatrixIndexT src_rows = src.NumRows(); for (size_t i = 0; i < reorder.size(); i++) KALDI_ASSERT(reorder[i] >= -1 && reorder[i] < src_rows); #endif CuArray cuda_reorder(reorder); Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // This kernel, as it is newer has the (x,y) dims as (rows,cols). dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), cuda_reorder.Data(), Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().CopyRows(src.Mat(), reorder); } } template void CuMatrixBase::SumColumnRanges(const CuMatrixBase &src, const CuArray &indices) { KALDI_ASSERT(static_cast(indices.Dim()) == NumCols()); KALDI_ASSERT(NumRows() == src.NumRows()); if (NumRows() == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // This kernel, as it is newer has the (x,y) dims as (rows,cols). dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); cuda_sum_column_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(), src.Dim(), indices.Data()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { // Implement here for the CPU.. int32 num_rows = this->num_rows_, num_cols = this->num_cols_, this_stride = this->stride_, src_stride = src.stride_; Real *data = this->data_; const Real *src_data = src.data_; const Int32Pair *indices_data = indices.Data(); for (int32 row = 0; row < num_rows; row++) { for (int32 col = 0; col < num_cols; col++) { int32 start_col = indices_data[col].first, end_col = indices_data[col].second; Real sum = 0.0; for (int32 src_col = start_col; src_col < end_col; src_col++) sum += src_data[row * src_stride + src_col]; data[row * this_stride + col] = sum; } } } } template void CuMatrixBase::CopyLowerToUpper() { KALDI_ASSERT(num_cols_ == num_rows_); if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(this->num_rows_, CU2DBLOCK), n_blocks(this->num_cols_, CU2DBLOCK)); cuda_copy_low_upp(dimGrid, dimBlock, data_, Dim()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().CopyLowerToUpper(); } } template void CuMatrixBase::CopyUpperToLower() { KALDI_ASSERT(num_cols_ == num_rows_); if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(this->num_rows_, CU2DBLOCK), n_blocks(this->num_cols_, CU2DBLOCK)); cuda_copy_upp_low(dimGrid, dimBlock, data_, Dim()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { Mat().CopyUpperToLower(); } } template Real CuMatrixBase::Sum() const { CuVector row_sum(NumCols()); row_sum.AddRowSumMat(1.0, *this, 0.0); return row_sum.Sum(); } template Real CuMatrixBase::Trace(bool check_square) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; if (check_square) KALDI_ASSERT(this->num_rows_ == this->num_cols_); MatrixIndexT dim = std::min(this->num_rows_, this->num_cols_); CuVector tmp(1, kUndefined); // for result. int dimBlock(CU1DBLOCK); int dimGrid = 1;// only 1 block here. we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); cuda_vec_sum(dimGrid, dimBlock, data_, tmp.Data(), dim, Stride() + 1); CuDevice::Instantiate().AccuProfile("CuVectorBase::Sum", tim.Elapsed()); return tmp(0); } else #endif { return Mat().Trace(check_square); } } template void CuMatrixBase::SetRandn() { if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuRand tmp; tmp.RandGaussian(this); } else #endif { Mat().SetRandn(); } } template void CuMatrixBase::SetRandUniform() { if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuRand tmp; tmp.RandUniform(this); } else #endif { Mat().SetRandUniform(); } } template void Matrix::Swap(CuMatrix *mat) { mat->Swap(this); } // instantiate the template above. template void Matrix::Swap(CuMatrix *mat); template void Matrix::Swap(CuMatrix *mat); /// Copy constructor from another type. template template CuMatrix::CuMatrix(const CuMatrixBase & M, MatrixTransposeType trans) : CuMatrixBase() { if (trans == kNoTrans) { Resize(M.NumRows(), M.NumCols()); this->CopyFromMat(M); } else { Resize(M.NumCols(), M.NumRows()); this->CopyFromMat(M, kTrans); } } // Instantiate this constructor for float->double and double->float. template CuMatrix::CuMatrix(const CuMatrixBase & M, MatrixTransposeType trans); template CuMatrix::CuMatrix(const CuMatrixBase & M, MatrixTransposeType trans); /* template CuMatrix::DeriveLastLayerComponent(int32 i, int32 label, Real weight, Real this_prob) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { cuda_derive_last_layer_component(i, label, weight, this_prob); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } #endif { } } */ template void CuMatrix::Transpose() { if (this->num_rows_ == 0) return; #if HAVE_CUDA == 1 if (this->num_rows_ == this->num_cols_ && CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // (x,y) indices will be (row of *this, col of *this) dim3 dimGrid(n_blocks(this->num_rows_, CU2DBLOCK), n_blocks(this->num_cols_, CU2DBLOCK)); cuda_transpose_matrix(dimGrid, dimBlock, this->data_, this->Dim()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { CuMatrix tmp(*this, kTrans); *this = tmp; } } // Version of AddMatMat where 2nd argument is of type CuBlockMatrix. template void CuMatrixBase::AddMatBlock( Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, const CuBlockMatrix &B, MatrixTransposeType transB, Real beta) { // Check dimensions int32 A_num_rows = A.NumRows(), A_num_cols = A.NumCols(), A_row_stride = A.Stride(), A_col_stride = 1, B_num_rows = B.NumRows(), B_num_cols = B.NumCols(); if (transA == kTrans) { std::swap(A_num_rows, A_num_cols); std::swap(A_row_stride, A_col_stride); } if (transB == kTrans) { std::swap(B_num_rows, B_num_cols); } // At this point the {A,B}_{rows,cols} variables are // after any transposition. KALDI_ASSERT(NumRows() == A_num_rows && NumCols() == B_num_cols); KALDI_ASSERT(A_num_cols == B_num_rows); int32 B_num_blocks = B.NumBlocks(); if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; MatrixDim this_dim = Dim(); dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // (x,y) indices will be (row of *this, block of B) dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(B_num_blocks, CU2DBLOCK)); cuda_add_mat_blockmat(dimGrid, dimBlock, data_, this_dim, A.Data(), A_num_rows, A_num_cols, A_row_stride, A_col_stride, B.CuData(), B_num_blocks, alpha, beta, (transB == kTrans ? 1 : 0)); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { // "row_offset" and "col_offset" are offsets into B (or into B^T, if // transB == kTrans). int32 row_offset = 0, col_offset = 0; for (int32 b = 0; b < B_num_blocks; b++) { const CuSubMatrix this_block = B.Block(b); int32 this_num_rows = this_block.NumRows(), this_num_cols = this_block.NumCols(); if (transB == kTrans) std::swap(this_num_rows, this_num_cols); CuSubMatrix this_part(*this, 0, num_rows_, col_offset, this_num_cols); CuSubMatrix A_part = (transA == kNoTrans ? CuSubMatrix(A, 0, num_rows_, row_offset, this_num_rows) : CuSubMatrix(A, row_offset, this_num_rows, 0, num_rows_)); this_part.AddMatMat(alpha, A_part, transA, this_block, transB, beta); row_offset += this_num_rows; col_offset += this_num_cols; } // Note: the values being compared below are all after applying any // transposition to B. KALDI_ASSERT(row_offset == B_num_rows && col_offset == B_num_cols); } } template void CuMatrixBase::AddElements(Real alpha, const std::vector >& input) { // Checks the dimension. MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; for (int32 i = 0; i < input.size(); ++i) { KALDI_ASSERT(input[i].row < num_rows && input[i].row >= 0 && input[i].column < num_cols && input[i].column >= 0); } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { void *addr = CuDevice::Instantiate().Malloc(input.size() * sizeof(MatrixElement)); CU_SAFE_CALL(cudaMemcpy(addr, input.data(), input.size() * sizeof(MatrixElement), cudaMemcpyHostToDevice)); Timer tim; int dimBlock(CU1DBLOCK); int dimGrid = 1;// only 1 block here. we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); cuda_matrix_add_elements(dimGrid, dimBlock, this->data_, this->Dim(), alpha, (MatrixElement*)addr, input.size()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().Free(addr); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { for (int32 i = 0; i < input.size(); i++) { (*this)(input[i].row, input[i].column) += alpha * input[i].weight; } } } template void CuMatrixBase::Lookup(const std::vector &indices, std::vector *output) const { // Checks the dimension. MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; for (int32 i = 0; i < indices.size(); ++i) { KALDI_ASSERT(indices[i].first < num_rows && indices[i].first >= 0 && indices[i].second < num_cols && indices[i].second >= 0); } // Checks the pointer. KALDI_ASSERT(output != NULL); // Resizes the output vector. output->resize(indices.size()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuArray cuda_indices(indices); CuArray cuda_output(output->size()); Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); cuda_matrix_lookup(dimGrid, dimBlock, this->data_, this->Dim(), cuda_indices.Data(), indices.size(), cuda_output.Data()); CU_SAFE_CALL(cudaGetLastError()); cuda_output.CopyToVec(output); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { for (int32 i = 0; i < indices.size(); i++) { (*output)[i] = (*this)(indices[i].first, indices[i].second); } } } template void CuMatrixBase::EqualElementMask(const CuMatrixBase &mat, CuMatrix *mask) const { // Check the inputs: KALDI_ASSERT(mat.NumRows() == NumRows() && mat.NumCols() == NumCols()); KALDI_ASSERT(mask != NULL); // Resizes the output matrix: mask->Resize(NumRows(), NumCols(), kSetZero); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); cuda_equal_element_mask(dimGrid, dimBlock, this->data_, mat.Data(), mask->Data(), this->Dim(), mat.Stride(), mask->Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { for (int32 r = 0; r < NumRows(); r++) { for (int32 c = 0; c < NumCols(); c++) { (*mask)(r,c) = ((*this)(r,c) == mat(r,c) ? 1.0 : 0.0); } } } } /** * Print the matrix to stream */ template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat) { Matrix temp(mat.NumRows(), mat.NumCols()); mat.CopyToMat(&temp); out << temp; return out; } // instantiate the template template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); // Instantiate classes CuMatrix and CuMatrixBase for float and double. template class CuMatrix; template class CuMatrix; template class CuMatrixBase; template class CuMatrixBase; } // namespace kaldi