// cudamatrix/cu-math.cc // Copyright 2009-2012 Karel Vesely // Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "util/timer.h" #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" namespace kaldi { namespace cu { /* * templated functions wrapping the ANSI-C CUDA kernel functions */ template void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, Real lr) { KALDI_ASSERT(SameDim(*weight, *grad)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK)); cuda_regularize_l1(dimGrid, dimBlock, weight->data_, grad->data_, l1, lr, weight->Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { MatrixBase &weight2 = weight->Mat(); MatrixBase &grad2 = grad->Mat(); for(MatrixIndexT r=0; r 0.0) ^ (before > 0.0)) { weight2(r, c) = 0.0; grad2(r, c) = 0.0; } else { weight2(r, c) -= l1_signed; } } } } } template void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, CuMatrixBase *tgt) { KALDI_ASSERT(src.NumCols() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; /* Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK)); */ /* * Let's use blocksize 4 x 128 (512 threads/block) * and extend the randomizable matrices to: col 4*65535, row 128*65535 * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints)) */ dim3 dimBlock(4, 128); dim3 dimGrid(n_blocks(tgt->NumCols(), 4), n_blocks(copy_from_idx.Dim(), 128)); /* */ MatrixDim dimsrc = src.Dim(); dimsrc.rows=copy_from_idx.Dim(); MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim(); cuda_randomize(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_idx.Data(), dimtgt, dimsrc); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { // randomize in CPU const MatrixBase &srcmat = src.Mat(); const int32 *copy_from_idxvec = copy_from_idx.Data(); MatrixBase &tgtmat = tgt->Mat(); for(int32 i=0; i void Splice(const CuMatrix &src, const CuArray &frame_offsets, CuMatrix *tgt) { KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); cuda_splice(dimGrid, dimBlock, tgt->data_, src.data_, frame_offsets.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { // expand in CPU const MatrixBase &srcmat = src.Mat(); const int32 *frame_offsetvec = frame_offsets.Data(); int32 dim = frame_offsets.Dim(); MatrixBase &tgtmat = tgt->Mat(); // for(int32 r=0; r < tgtmat.NumRows(); r++) { for(int32 off=0; off < dim; off++) { int32 r_off = r + frame_offsetvec[off]; if(r_off < 0) r_off = 0; if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1; memcpy(tgtmat.RowData(r)+off*srcmat.NumCols(),srcmat.RowData(r_off),sizeof(Real)*srcmat.NumCols()); } } } } template void Copy(const CuMatrix &src, const CuArray ©_from_indices, CuMatrix *tgt) { KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); cuda_copy(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_indices.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { // expand in CPU const MatrixBase &srcmat = src.Mat(); const int32 *copy_from_indicesvec = copy_from_indices.Data(); int32 dim = copy_from_indices.Dim(); MatrixBase &tgtmat = tgt->Mat(); // for(int32 r = 0; r < tgtmat.NumRows(); r++) { for(int32 c = 0; c < dim; c++) { tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]); } } } } // instantiate the templates. template void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, float l1, float lr); template void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, double l1, double lr); template void Splice(const CuMatrix &src, const CuArray &frame_offsets, CuMatrix *tgt); template void Splice(const CuMatrix &src, const CuArray &frame_offsets, CuMatrix *tgt); template void Copy(const CuMatrix &src, const CuArray ©_from_indices, CuMatrix *tgt); template void Copy(const CuMatrix &src, const CuArray ©_from_indices, CuMatrix *tgt); template void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, CuMatrixBase *tgt); template void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, CuMatrixBase *tgt); } //namespace cu } //namespace kaldi