// cudamatrix/cublas-wrappers.h

// Copyright 2013  Johns Hopkins University (author: Daniel Povey);

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//  http://www.apache.org/licenses/LICENSE-2.0

// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_CUBLAS_WRAPPERS_H_
#define KALDI_MATRIX_CUBLAS_WRAPPERS_H_ 1

// Do not include this file directly.  It is to be included
// by .cc files in this directory.

namespace kaldi {
#if HAVE_CUDA == 1

inline void cublas_gemm(char transa, char transb, int m, int n,int k, float alpha, const float *A, int lda,const float *B, int ldb, float beta, float *C, int ldc) {
  cublasSgemm(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
}
inline void cublas_gemm(char transa, char transb, int m, int n,int k, double alpha, const double *A, int lda,const double *B, int ldb, double beta, double *C, int ldc) {
  cublasDgemm(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
}
inline void cublas_trsm(int m, int n, float alpha, const float* A, int lda, float* B, int ldb) {
  cublasStrsm('l','u','n','n',m,n,alpha,A,lda,B,ldb);
}
inline void cublas_trsm(int m, int n, double alpha, const double* A, int lda, double* B, int ldb) {
  cublasDtrsm('l','u','n','n',m,n,alpha,A,lda,B,ldb);
}
inline void cublas_syrk(char uplo, char trans, int n, int k,
                        float alpha, const float *A, int lda,
                        float beta, float *C, int ldc) {
  cublasSsyrk(uplo,trans,n,k,alpha,A,lda,beta,C,ldc);
}
inline void cublas_syrk(char uplo, char trans, int n, int k,
                        double alpha, const double *A, int lda,
                        double beta, double *C, int ldc) {
  cublasDsyrk(uplo,trans,n,k,alpha,A,lda,beta,C,ldc);
}
inline float cublas_dot(int n, const float *x, int incx, const float *y, int incy) {
  return cublasSdot(n, x, incx, y, incy);
}
inline double cublas_dot(int n, const double *x, int incx, const double *y, int incy) {
  return cublasDdot(n, x, incx, y, incy);
}
inline float cublas_asum(int n, const float* x, int incx) {
  return cublasSasum(n, x, incx);
}
inline double cublas_asum(int n, const double* x, int incx) {
  return cublasDasum(n, x, incx);
}
inline float cublas_nrm2(int n, const float* x, int incx) {
  return cublasSnrm2(n, x, incx);
}
inline double cublas_nrm2(int n, const double* x, int incx) {
  return cublasDnrm2(n, x, incx);
}
inline void cublas_copy(int n, const float* x, int incx,
                        float* y, int incy) {
  cublasScopy(n,x,incx,y,incy);
}
inline void cublas_copy(int n, const double* x, int incx,
                          double* y, int incy) {
  cublasDcopy(n,x,incx,y,incy);
}
inline void cublas_scal(int n, float alpha, float* mat, int incx) {
  cublasSscal(n, alpha, mat, incx);
}
inline void cublas_scal(int n, double alpha, double* mat, int incx) {
  cublasDscal(n, alpha, mat, incx);
}

inline void cublas_axpy(int n, float alpha, const float* x, int incx, float* y, int incy) {
  cublasSaxpy(n, alpha, x, incx, y, incy);
}
inline void cublas_axpy(int n, double alpha, const double* x, int incx, double* y, int incy) {
  cublasDaxpy(n, alpha, x, incx, y, incy);
}
inline void cublas_gemv(char trans, int m, int n, float alpha,
                        const float* A, int lda, const float* x,
                        int incx, float beta, float* y, int incy) {
  cublasSgemv(trans,m,n,alpha,A,lda,x,incx,beta,y,incy);
}
inline void cublas_gemv(char trans, int m, int n, double alpha,
                        const double* A, int lda, const double* x,
                        int incx, double beta, double* y, int incy) {
  cublasDgemv(trans,m,n,alpha,A,lda,x,incx,beta,y,incy);
}

inline void cublas_spmv(char uplo, int n, float alpha, const float *AP, const float *x,
                        int incx, float beta, float *y, int incy) {
  cublasSspmv(uplo, n, alpha, AP, x, incx, beta, y, incy);
}
inline void cublas_spmv(char uplo, int n, double alpha, const double *AP, const double *x,
                        int incx, double beta, double *y, int incy) {
  cublasDspmv(uplo, n, alpha, AP, x, incx, beta, y, incy);
}

// Use caution with these, the 'transpose' argument is the opposite of what it
// should really be, due to CUDA storing things in column major order.  We also
// had to switch 'l' to 'u'; we view our packed matrices as lower-triangular,
// row-by-row, but CUDA views the same layout as upper-triangular,
// column-by-column.
inline void cublas_tpmv(char trans, int n,
                        const float* Ap, float* x, int incx) {
  return cublasStpmv('u', trans, 'n', n, Ap, x, incx);
}
inline void cublas_tpmv(char trans, int n, const double* Ap,
                        double* x,int incx) {
  return cublasDtpmv('u', trans, 'n', n, Ap, x, incx);
}

inline void cublas_spr(char uplo, int n, float alpha, const float *x,
                      int incx, float *AP) {
  cublasSspr(uplo, n, alpha, x, incx, AP);
}
inline void cublas_spr(char uplo, int n, double alpha, const double *x,
                      int incx, double *AP) {
  cublasDspr(uplo, n, alpha, x, incx, AP);
}

#endif
}
// namespace kaldi

#endif