diff --git a/DESCRIPTION b/DESCRIPTION index f31d101..b9ea3e5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,12 @@ Package: learner Type: Package Title: Latent Space-Based Transfer Learning -Version: 0.2.0 +Version: 1.0.0 Authors@R: c(person("Sean", "McGrath", role = c("aut", "cre"), email = "sean.mcgrath514@gmail.com", comment = c(ORCID = "0000-0002-7281-3516")), + person("Ryan", "ODea", role = c("aut"), + email = "ryanodea@hsph.harvard.edu"), person("Cenhao", "Zhu", role = c("aut"), email = "cenhaoz@mit.edu"), person("Rui", "Duan", role = c("aut"), @@ -22,7 +24,10 @@ LazyData: true RoxygenNote: 7.3.2 URL: https://github.com/stmcg/learner BugReports: https://github.com/stmcg/learner/issues -Imports: doParallel, foreach, ScreeNOT +Imports: ScreeNOT, + Rcpp (>= 1.0.11), + RcppEigen +LinkingTo: Rcpp, RcppEigen Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index adc0742..1ab3843 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,5 +3,6 @@ export(cv.learner) export(dlearner) export(learner) -import(doParallel) -import(foreach) +import(RcppEigen) +importFrom(Rcpp,sourceCpp) +useDynLib(learner, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index 1eb9ec0..84fe99e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ -### learner version 0.2.0 (TBD) +### learner version 1.0.0 (TBD) +* Implemented the optimization algorithm in Rcpp, resulting in significant computational speed-ups. * Fixed a bug in `cv.learner` when `Y_target` has missing entries. ### learner version 0.1.0 (2025-01-08) diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..ec831f9 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,15 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +learner_cpp <- function(Y_source, Y_target, r, lambda1, lambda2, step_size, max_iter, threshold, max_value) { + .Call(`_learner_learner_cpp`, Y_source, Y_target, r, lambda1, lambda2, step_size, max_iter, threshold, max_value) +} + +cv_learner_cpp <- function(Y_source, Y_target, lambda1_all, lambda2_all, step_size, n_folds, max_iter, threshold, n_cores, r, max_value, index_set) { + .Call(`_learner_cv_learner_cpp`, Y_source, Y_target, lambda1_all, lambda2_all, step_size, n_folds, max_iter, threshold, n_cores, r, max_value, index_set) +} + +omp_max_threads <- function() { + .Call(`_learner_omp_max_threads`) +} + diff --git a/R/cv_learner.R b/R/cv_learner.R deleted file mode 100644 index a16e93a..0000000 --- a/R/cv_learner.R +++ /dev/null @@ -1,156 +0,0 @@ -#' Cross-validation for LEARNER -#' -#' This function performs k-fold cross-validation to select the nuisance parameters \eqn{(\lambda_1, \lambda_2)} for \code{\link{learner}}. -#' -#' @param Y_target matrix containing the target population data, as in \code{\link{learner}} -#' @param Y_source matrix containing the source population data, as in \code{\link{learner}} -#' @param r (optional) integer specifying the rank of the knowledge graphs, as in \code{\link{learner}} -#' @param lambda_1_all vector of numerics specifying the candidate values of \eqn{\lambda_1} (see Details) -#' @param lambda_2_all vector of numerics specifying the candidate values of \eqn{\lambda_2} (see Details) -#' @param step_size numeric scalar specifying the step size for the Newton steps in the numerical optimization algorithm, as in \code{\link{learner}} -#' @param n_folds an integer specify the number of cross-validation folds. The default is \code{4}. 
-#' @param n_cores an integer specifying the number of CPU cores in parallelization. Parallelization is performed across the different candidate \eqn{(\lambda_1, \lambda_2)} pairs. The default is \code{1}, i.e., no parallelization. -#' @param control a list of parameters for controlling the stopping criteria for the numerical optimization algorithm, as in \code{\link{learner}}. -#' -#' @return A list with the following elements: -#' \item{lambda_1_min}{value of \eqn{\lambda_1} with the smallest MSE} -#' \item{lambda_2_min}{value of \eqn{\lambda_2} with the smallest MSE} -#' \item{mse_all}{matrix containing MSE value for each \eqn{(\lambda_1, \lambda_2)} pair. The rows correspond to the \eqn{\lambda_1} values, and the columns correspond to the \eqn{\lambda_2} values.} -#' \item{r}{rank value used.} -#' -#' @details -#' Given sets of candidate values of \eqn{\lambda_1} and \eqn{\lambda_2}, this function performs k-fold cross-validation to select the pair \eqn{(\lambda_1, \lambda_2)} with the smallest held out error. This function randomly partitions the entries of \code{Y_target} into \eqn{k} (approximately) equally sized subsamples. The training data sets are obtained by removing one of the \eqn{k} subsamples and the corresponding test data sets are based on the held out subsamples. The \code{\link{learner}} function is applied to each training data set. The held out error is computed by the mean squared error comparing the entries in the test data sets with those imputed from the LEARNER estimates. See McGrath et al. (2024) for further details. -#' -#' @references -#' McGrath, S., Zhu, C,. Guo, M. and Duan, R. (2024). \emph{LEARNER: A transfer learning method for low-rank matrix estimation}. arXiv preprint arXiv:2412.20605. -#' -#' @examples -#' res <- cv.learner(Y_source = dat_highsim$Y_source, -#' Y_target = dat_highsim$Y_target, -#' lambda_1_all = c(1, 10, 100), -#' lambda_2_all = c(1, 10, 100), -#' step_size = 0.003) -#' -#' -#' @import foreach -#' @import doParallel -#' -#' @export - - - -cv.learner <- function(Y_source, Y_target, r, lambda_1_all, lambda_2_all, - step_size, n_folds = 4, n_cores = 1, control = list()){ - - ## Error catching - if (!identical(dim(Y_source), dim(Y_target))){ - stop('Y_source and Y_target must have the same dimensions') - } - if (any(is.na(Y_source))){ - stop('Y_source cannot have NA values.') - } - - ## Setting the rank and other parameters - p <- nrow(Y_source) - q <- ncol(Y_source) - if (missing(r)){ - max_rank <- min(p, q) / 3 - r <- max(ScreeNOT::adaptiveHardThresholding(Y = Y_source, k = max_rank)$r, 1) - } - if (!('max_iter' %in% names(control))){ - control$max_iter <- 100 - } - if (!('threshold' %in% names(control))){ - control$threshold <- 0.001 - } - if (!('max_value' %in% names(control))){ - control$max_value <- 10 - } - - svd_source <- svd(Y_source, nu = r, nv = r) - P_U <- diag(rep(1, p)) - svd_source$u %*% t(svd_source$u) - P_V <- diag(rep(1, q)) - svd_source$v %*% t(svd_source$v) - U_init <- svd_source$u %*% diag(sqrt(svd_source$d[1:r]), nrow = r, ncol = r) - V_init <- svd_source$v %*% diag(sqrt(svd_source$d[1:r]), nrow = r, ncol = r) - - if (n_folds < 2){ - stop('n_folds must be 2 or greater.') - } - n_lambda_1 <- length(lambda_1_all) - n_lambda_2 <- length(lambda_2_all) - - ## Creating training and testing data sets - available_indices <- 1:(p * q) - available_indices <- available_indices[!(available_indices %in% which(is.na(Y_target)))] - n_indices <- length(available_indices) - indices <- sample(available_indices, size = n_indices, 
replace = FALSE) - - index_set <- vector(mode = "list", length = n_folds) - for (fold in 1:n_folds){ - index_set[[fold]] <- indices[floor((fold - 1) * n_indices / n_folds + 1):floor(fold * n_indices / n_folds)] - } - - ## Applying LEARNER for each candidate lambda_1 and lambda_2 values - if (n_cores > 1){ - ## Parallelization - registerDoParallel(cores = n_cores) - mse_all <- foreach(lambda_1_ind = 1:n_lambda_1, .combine = 'cbind') %:% - foreach(lambda_2_ind = 1:n_lambda_2, .combine = 'c') %dopar% { - cv.learner_helper(Y_target = Y_target, index_set = index_set, n_folds = n_folds, - lambda1 = lambda_1_all[lambda_1_ind], lambda2 = lambda_2_all[lambda_2_ind], - U_init = U_init, V_init = V_init, - P_U = P_U, P_V = P_V, step_size = step_size, - control = control) - } - } else { - ## Sequential - mse_all <- foreach(lambda_1_ind = 1:n_lambda_1, .combine = 'cbind') %:% - foreach(lambda_2_ind = 1:n_lambda_2, .combine = 'c') %do% { - cv.learner_helper(Y_target = Y_target, index_set = index_set, n_folds = n_folds, - lambda1 = lambda_1_all[lambda_1_ind], lambda2 = lambda_2_all[lambda_2_ind], - U_init = U_init, V_init = V_init, - P_U = P_U, P_V = P_V, step_size = step_size, - control = control) - } - } - mse_all <- t(mse_all) - - min_ind <- which(mse_all == min(mse_all, na.rm = TRUE), arr.ind = TRUE) - lambda_1_min <- lambda_1_all[min_ind[1]] - lambda_2_min <- lambda_2_all[min_ind[2]] - - return(list( - lambda_1_min = lambda_1_min, - lambda_2_min = lambda_2_min, - mse_all = mse_all, - r = r - )) -} - -cv.learner_helper <- function(Y_target, index_set, n_folds, - lambda1, lambda2, - U_init, V_init, P_U, P_V, - step_size, control){ - norm_temp <- 0 - p <- nrow(Y_target); q <- ncol(Y_target) - for (fold in 1:n_folds){ - Y_training <- Y_target - myset <- index_set[[fold]] - Y_training[myset] <- NA - perc_nonmissing <- sum(!is.na(Y_training)) / (p * q) - - fit <- try(learner_helper(theta_hat = Y_training, - lambda1 = lambda1, - lambda2 = lambda2, - U_init = U_init, V_init = V_init, - P_U = P_U, P_V = P_V, missing = TRUE, - perc_nonmissing = perc_nonmissing, - step_size = step_size, control = control)) - if ('try-error' %in% class(fit)){ - return(NA) - } else { - norm_temp <- norm_temp + sum((fit$learner_estimate[myset] - Y_target[myset])^2) - } - } - return(norm_temp) -} diff --git a/R/dlearner.R b/R/dlearner.R index 1885b82..0454de2 100644 --- a/R/dlearner.R +++ b/R/dlearner.R @@ -54,8 +54,9 @@ dlearner <- function(Y_source, Y_target, r){ } svd_source <- svd(Y_source, nu = r, nv = r) - dlearner_estimate <- (svd_source$u %*% t(svd_source$u)) %*% - Y_target %*% (svd_source$v %*% t(svd_source$v)) + dlearner_estimate <- svd_source$u %*% + (t(svd_source$u) %*% Y_target %*% svd_source$v) %*% + t(svd_source$v) colnames(dlearner_estimate) <- colnames(Y_source) rownames(dlearner_estimate) <- rownames(Y_source) diff --git a/R/globals.R b/R/globals.R index 5af0ca7..7012eb0 100644 --- a/R/globals.R +++ b/R/globals.R @@ -2,3 +2,14 @@ if(getRversion() >= "2.15.1"){ # To remove 'no visible binding for global variable ...' 
notes utils::globalVariables(c('lambda_1_ind', 'lambda_2_ind')) } + +.onAttach <- function(libname, pkgname) { + max_threads <- tryCatch(omp_max_threads(), error = function(e) 1) + if (max_threads < 2) { + packageStartupMessage("*******\n + This installation of learner has not detected OpenMP support\n + It will still work but will not support multithreading via the `n_cores` argument + If you plan to use multithreading, please ensure you are using R>=3.4.0 and have OpenMP installed\n + *******") + } +} diff --git a/R/learner-package.R b/R/learner-package.R new file mode 100644 index 0000000..c9c0ad8 --- /dev/null +++ b/R/learner-package.R @@ -0,0 +1,6 @@ +## usethis namespace: start +#' @import RcppEigen +#' @importFrom Rcpp sourceCpp +#' @useDynLib learner, .registration = TRUE +## usethis namespace: end +NULL diff --git a/R/learner.R b/R/learner.R index 0df2bc7..5116111 100644 --- a/R/learner.R +++ b/R/learner.R @@ -1,3 +1,84 @@ +#' Cross-validation for LEARNER +#' +#' This function performs k-fold cross-validation to select the nuisance parameters \eqn{(\lambda_1, \lambda_2)} for \code{\link{learner}}. +#' +#' @param Y_target matrix containing the target population data, as in \code{\link{learner}} +#' @param Y_source matrix containing the source population data, as in \code{\link{learner}} +#' @param r (optional) integer specifying the rank of the knowledge graphs, as in \code{\link{learner}} +#' @param lambda_1_all vector of numerics specifying the candidate values of \eqn{\lambda_1} (see Details) +#' @param lambda_2_all vector of numerics specifying the candidate values of \eqn{\lambda_2} (see Details) +#' @param step_size numeric scalar specifying the step size for the Newton steps in the numerical optimization algorithm, as in \code{\link{learner}} +#' @param n_folds an integer specifying the number of cross-validation folds. The default is \code{4}. +#' @param n_cores an integer specifying the number of CPU cores in OpenMP parallelization. Parallelization is performed across the different candidate \eqn{(\lambda_1, \lambda_2)} pairs. The default is \code{1}, i.e., no parallelization. +#' @param control a list of parameters for controlling the stopping criteria for the numerical optimization algorithm, as in \code{\link{learner}}. +#' +#' @return A list with the following elements: +#' \item{lambda_1_min}{value of \eqn{\lambda_1} with the smallest MSE} +#' \item{lambda_2_min}{value of \eqn{\lambda_2} with the smallest MSE} +#' \item{mse_all}{matrix containing MSE value for each \eqn{(\lambda_1, \lambda_2)} pair. The rows correspond to the \eqn{\lambda_1} values, and the columns correspond to the \eqn{\lambda_2} values.} +#' \item{r}{rank value used.} +#' +#' @details +#' Given sets of candidate values of \eqn{\lambda_1} and \eqn{\lambda_2}, this function performs k-fold cross-validation to select the pair \eqn{(\lambda_1, \lambda_2)} with the smallest held out error. This function randomly partitions the entries of \code{Y_target} into \eqn{k} (approximately) equally sized subsamples. The training data sets are obtained by removing one of the \eqn{k} subsamples and the corresponding test data sets are based on the held out subsamples. The \code{\link{learner}} function is applied to each training data set. The held out error is computed by the mean squared error comparing the entries in the test data sets with those imputed from the LEARNER estimates. See McGrath et al. (2024) for further details. +#' +#' @references +#' McGrath, S., Zhu, C., Guo, M. and Duan, R. (2024). 
\emph{LEARNER: A transfer learning method for low-rank matrix estimation}. arXiv preprint arXiv:2412.20605. +#' +#' @examples +#' res <- cv.learner(Y_source = dat_highsim$Y_source, +#' Y_target = dat_highsim$Y_target, +#' lambda_1_all = c(1, 10, 100), +#' lambda_2_all = c(1, 10, 100), +#' step_size = 0.003) +#' +#' +#' +#' @export +cv.learner <- function(Y_source, Y_target, r, lambda_1_all, lambda_2_all, + step_size, n_folds = 4, n_cores = 1, control = list()) { + ## Error catching + if (!identical(dim(Y_source), dim(Y_target))){ + stop('Y_source and Y_target must have the same dimensions') + } + if (any(is.na(Y_source))){ + stop('Y_source cannot have NA values.') + } + + if (missing(r)){ + p <- nrow(Y_source); q <- ncol(Y_source) + max_rank <- min(p, q) / 3 + r <- max(ScreeNOT::adaptiveHardThresholding(Y = Y_source, k = max_rank)$r, 1) + } + if (!('max_iter' %in% names(control))){ + control$max_iter <- 100 + } + if (!('threshold' %in% names(control))){ + control$threshold <- 0.001 + } + if (!('max_value' %in% names(control))){ + control$max_value <- 10 + } + + if (n_folds < 2){ + stop('n_folds must be 2 or greater.') + } + + ## Creating training and testing data sets + available_indices <- which(!is.na(Y_target)) + n_indices <- length(available_indices) + indices <- sample(available_indices, size = n_indices, replace = FALSE) - 1 + + index_set <- vector(mode = "list", length = n_folds) + for (fold in 1:n_folds){ + index_set[[fold]] <- indices[floor((fold - 1) * n_indices / n_folds + 1):floor(fold * n_indices / n_folds)] + } + + result <- cv_learner_cpp(Y_source, Y_target, lambda_1_all, lambda_2_all, step_size, + n_folds, control$max_iter, control$threshold, n_cores, r, control$max_value, + index_set) + return(result) +} + #' Latent space-based transfer learning #' #' This function applies the LatEnt spAce-based tRaNsfer lEaRning (LEARNER) method (McGrath et al. 2024) to leverage data from a source population to improve @@ -51,7 +132,7 @@ #' #' @export learner <- function(Y_source, Y_target, r, lambda_1, lambda_2, step_size, - control = list()){ + control = list()) { ## Error catching if (!identical(dim(Y_source), dim(Y_target))){ stop('Y_source and Y_target must have the same dimensions') @@ -66,12 +147,8 @@ learner <- function(Y_source, Y_target, r, lambda_1, lambda_2, step_size, stop('lambda_2 must be a scalar. 
See the function cv.learner for selecting a suitable lambda_2 value.') } - ## Setting the rank and other parameters - missing <- any(is.na(Y_target)) - p <- nrow(Y_source) - q <- ncol(Y_source) - perc_nonmissing <- sum(!is.na(Y_target)) / (p * q) if (missing(r)){ + p <- nrow(Y_source); q <- ncol(Y_source) max_rank <- min(p, q) / 3 r <- max(ScreeNOT::adaptiveHardThresholding(Y = Y_source, k = max_rank)$r, 1) } @@ -85,114 +162,8 @@ learner <- function(Y_source, Y_target, r, lambda_1, lambda_2, step_size, control$max_value <- 10 } - svd_source <- svd(Y_source, nu = r, nv = r) - P_U <- diag(rep(1, p)) - svd_source$u %*% t(svd_source$u) - P_V <- diag(rep(1, q)) - svd_source$v %*% t(svd_source$v) - U_init <- svd_source$u %*% diag(sqrt(svd_source$d[1:r]), nrow = r, ncol = r) - V_init <- svd_source$v %*% diag(sqrt(svd_source$d[1:r]), nrow = r, ncol = r) - - out <- learner_helper(control = control, U_init = U_init, V_init = V_init, lambda1 = lambda_1, - lambda2 = lambda_2, theta_hat = Y_target, P_U = P_U, P_V = P_V, missing = missing, - perc_nonmissing = perc_nonmissing, step_size = step_size) - - return(out) -} - -learner_helper <- function(control, U_init, V_init, lambda1, lambda2, theta_hat, P_U, P_V, missing, - perc_nonmissing, step_size){ - U <- U_init - V <- V_init - max_iter <- control$max_iter - threshold <- control$threshold - max_value <- control$max_value - - obj_func_init <- f(U = U, V = V, lambda1 = lambda1, lambda2 = lambda2, theta_hat = theta_hat, P_U = P_U, P_V = P_V, missing = missing, - perc_nonmissing = perc_nonmissing) - obj_func_best <- obj_func_init; U_best <- U; V_best <- V - obj_func <- rep(NA, times = max_iter) - U_norm <- norm(U, type = 'F') - V_norm <- norm(V, type = 'F') - - for (iter in 1:max_iter){ - f_delta_U <- f_prime_u(U = U, V = V, lambda1 = lambda1, lambda2 = lambda2, theta_hat = theta_hat, P_U = P_U, missing = missing, perc_nonmissing = perc_nonmissing) - f_delta_V <- f_prime_v(U = U, V = V, lambda1 = lambda1, lambda2 = lambda2, theta_hat = theta_hat, P_V = P_V, missing = missing, perc_nonmissing = perc_nonmissing) - - f_delta_U_norm <- norm(f_delta_U, type = 'F') - f_delta_V_norm <- norm(f_delta_V, type = 'F') - - U <- U - (step_size * U_norm / f_delta_U_norm) * f_delta_U - V <- V - (step_size * V_norm / f_delta_V_norm) * f_delta_V - - obj_func[iter] <- f(U = U, V = V, lambda1 = lambda1, lambda2 = lambda2, theta_hat = theta_hat, P_U = P_U, P_V = P_V, missing = missing, perc_nonmissing = perc_nonmissing) - if (obj_func[iter] < obj_func_best){ - obj_func_best <- obj_func[iter]; U_best <- U; V_best <- V - } - if (iter > 1){ - if (abs(obj_func[iter] - obj_func[iter - 1]) < threshold){ - convergence_criterion <- 1 - break() - } - if (obj_func[iter] > max_value * obj_func_init){ - convergence_criterion <- 3 - break() - } - } - U_norm <- norm(U, type = 'F') - V_norm <- norm(V, type = 'F') - - if (iter == max_iter){ - convergence_criterion <- 2 - } - } - theta_hat <- U_best %*% t(V_best) - return(list(learner_estimate = theta_hat, objective_values = obj_func, - convergence_criterion = convergence_criterion, - r = ncol(U))) -} - - -norm_2_sq <- function(x){ - return(sum(x^2)) -} - -f <- function(U, V, lambda1, lambda2, theta_hat, P_U, P_V, missing, perc_nonmissing){ - if (missing){ - t1 <- sum((U %*% t(V) - theta_hat)^2, na.rm = TRUE) / perc_nonmissing - } else { - t1 <- norm(U %*% t(V) - theta_hat, type = 'F')^2 / perc_nonmissing - } - t2 <- lambda1 * norm(P_U %*% U, type = 'F')^2 - t3 <- lambda1 * norm(P_V %*% V, type = 'F')^2 - t4 <- lambda2 * norm(t(U) %*% U - t(V) 
%*% V, type = 'F')^2 - return(t1 + t2 + t3 + t4) + result <- learner_cpp(Y_source, Y_target, r, lambda_1, lambda_2, step_size, control$max_iter, control$threshold, control$max_value) + return(result) } -f_prime_u <- function(U, V, lambda1, lambda2, theta_hat, P_U, missing, perc_nonmissing){ - U_tilde <- t(U) %*% U - V_tilde <- t(V) %*% V - if (missing){ - temp <- U %*% t(V) - missing_entries <- which(is.na(theta_hat), arr.ind = T) - theta_hat[missing_entries] <- temp[missing_entries] - } - t1 <- (2 / perc_nonmissing) * (U %*% V_tilde - theta_hat %*% V) - t2 <- lambda1 * 2 * P_U %*% U - t4 <- lambda2 * 4 * U %*% (U_tilde - V_tilde) - return(t1 + t2 + t4) -} - -f_prime_v <- function(U, V, lambda1, lambda2, theta_hat, P_V, missing, perc_nonmissing){ - U_tilde <- t(U) %*% U - V_tilde <- t(V) %*% V - - if (missing){ - temp <- U %*% t(V) - missing_entries <- which(is.na(theta_hat), arr.ind = T) - theta_hat[missing_entries] <- temp[missing_entries] - } - t1 <- (2 / perc_nonmissing) * (V %*% U_tilde - t(theta_hat) %*% U) - t3 <- lambda1 * 2 * P_V %*% V - t4 <- lambda2 * 4 * V %*% (V_tilde - U_tilde) - return(t1 + t3 + t4) -} diff --git a/README.Rmd b/README.Rmd index 9f89d8b..595e3ff 100644 --- a/README.Rmd +++ b/README.Rmd @@ -39,7 +39,7 @@ devtools::install_github("stmcg/learner") ## Example We illustrate an example of how `learner` can be used. We first load the package. -```{r} +```{r, results="hide", message=FALSE} library(learner) ``` diff --git a/README_files/figure-gfm/unnamed-chunk-7-1.png b/README_files/figure-gfm/unnamed-chunk-7-1.png new file mode 100644 index 0000000..360e568 Binary files /dev/null and b/README_files/figure-gfm/unnamed-chunk-7-1.png differ diff --git a/man/cv.learner.Rd b/man/cv.learner.Rd index b0950c4..8c0bd0c 100644 --- a/man/cv.learner.Rd +++ b/man/cv.learner.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cv_learner.R +% Please edit documentation in R/learner.R \name{cv.learner} \alias{cv.learner} \title{Cross-validation for LEARNER} @@ -31,7 +31,7 @@ cv.learner( \item{n_folds}{an integer specify the number of cross-validation folds. The default is \code{4}.} -\item{n_cores}{an integer specifying the number of CPU cores in parallelization. Parallelization is performed across the different candidate \eqn{(\lambda_1, \lambda_2)} pairs. The default is \code{1}, i.e., no parallelization.} +\item{n_cores}{an integer specifying the number of CPU cores in OpenMP parallelization. Parallelization is performed across the different candidate \eqn{(\lambda_1, \lambda_2)} pairs. The default is \code{1}, i.e., no parallelization.} \item{control}{a list of parameters for controlling the stopping criteria for the numerical optimization algorithm, as in \code{\link{learner}}.} } @@ -56,6 +56,7 @@ res <- cv.learner(Y_source = dat_highsim$Y_source, step_size = 0.003) + } \references{ McGrath, S., Zhu, C,. Guo, M. and Duan, R. (2024). \emph{LEARNER: A transfer learning method for low-rank matrix estimation}. arXiv preprint arXiv:2412.20605. 
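A note on the R/C++ boundary before the `src/` changes below: `cv.learner()` now hands `cv_learner_cpp()` 0-based linear indices of the observed entries (`which(!is.na(Y_target)) - 1`), and the C++ side recovers matrix positions with `row = idx % p` and `col = idx / p`, i.e. it relies on R's column-major storage. A minimal standalone sketch of that convention (illustrative objects only, not package code):

```r
# 0-based, column-major index convention shared by cv.learner() and cv_learner_cpp()
p <- 4; q <- 3
Y <- matrix(seq_len(p * q), nrow = p, ncol = q)
idx0 <- which(!is.na(Y)) - 1   # 0-based linear indices, as passed to the C++ backend
row0 <- idx0 %% p              # matches `idx % p` in kernels.cpp
col0 <- idx0 %/% p             # matches `idx / p` (integer division)
stopifnot(all(Y[cbind(row0 + 1, col0 + 1)] == Y[idx0 + 1]))
```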
diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..22034c4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..a90b0d4 --- /dev/null +++ b/src/Makevars @@ -0,0 +1,3 @@ +PKG_CXXFLAGS = $(SHLIB_CXXFLAGS) $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) + diff --git a/src/Makevars.win b/src/Makevars.win new file mode 100644 index 0000000..a90b0d4 --- /dev/null +++ b/src/Makevars.win @@ -0,0 +1,3 @@ +PKG_CXXFLAGS = $(SHLIB_CXXFLAGS) $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) + diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..f39d9a3 --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,76 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include <RcppEigen.h> +#include <Rcpp.h> + +using namespace Rcpp; + +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + +// learner_cpp +List learner_cpp(const Eigen::MatrixXd& Y_source, const Eigen::MatrixXd& Y_target, int r, double lambda1, double lambda2, double step_size, int max_iter, double threshold, double max_value); +RcppExport SEXP _learner_learner_cpp(SEXP Y_sourceSEXP, SEXP Y_targetSEXP, SEXP rSEXP, SEXP lambda1SEXP, SEXP lambda2SEXP, SEXP step_sizeSEXP, SEXP max_iterSEXP, SEXP thresholdSEXP, SEXP max_valueSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type Y_source(Y_sourceSEXP); + Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type Y_target(Y_targetSEXP); + Rcpp::traits::input_parameter< int >::type r(rSEXP); + Rcpp::traits::input_parameter< double >::type lambda1(lambda1SEXP); + Rcpp::traits::input_parameter< double >::type lambda2(lambda2SEXP); + Rcpp::traits::input_parameter< double >::type step_size(step_sizeSEXP); + Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP); + Rcpp::traits::input_parameter< double >::type threshold(thresholdSEXP); + Rcpp::traits::input_parameter< double >::type max_value(max_valueSEXP); + rcpp_result_gen = Rcpp::wrap(learner_cpp(Y_source, Y_target, r, lambda1, lambda2, step_size, max_iter, threshold, max_value)); + return rcpp_result_gen; +END_RCPP +} +// cv_learner_cpp +List cv_learner_cpp(const Eigen::MatrixXd& Y_source, const Eigen::MatrixXd& Y_target, const std::vector<double>& lambda1_all, const std::vector<double>& lambda2_all, double step_size, int n_folds, int max_iter, double threshold, int n_cores, int r, double max_value, const std::vector<std::vector<int>>& index_set); +RcppExport SEXP _learner_cv_learner_cpp(SEXP Y_sourceSEXP, SEXP Y_targetSEXP, SEXP lambda1_allSEXP, SEXP lambda2_allSEXP, SEXP step_sizeSEXP, SEXP n_foldsSEXP, SEXP max_iterSEXP, SEXP thresholdSEXP, SEXP n_coresSEXP, SEXP rSEXP, SEXP max_valueSEXP, SEXP index_setSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type Y_source(Y_sourceSEXP); + Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type Y_target(Y_targetSEXP); + Rcpp::traits::input_parameter< const std::vector<double>& >::type lambda1_all(lambda1_allSEXP); + Rcpp::traits::input_parameter< const std::vector<double>& >::type lambda2_all(lambda2_allSEXP); + Rcpp::traits::input_parameter< double >::type step_size(step_sizeSEXP); 
Rcpp::traits::input_parameter< int >::type n_folds(n_foldsSEXP); + Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP); + Rcpp::traits::input_parameter< double >::type threshold(thresholdSEXP); + Rcpp::traits::input_parameter< int >::type n_cores(n_coresSEXP); + Rcpp::traits::input_parameter< int >::type r(rSEXP); + Rcpp::traits::input_parameter< double >::type max_value(max_valueSEXP); + Rcpp::traits::input_parameter< const std::vector<std::vector<int>>& >::type index_set(index_setSEXP); + rcpp_result_gen = Rcpp::wrap(cv_learner_cpp(Y_source, Y_target, lambda1_all, lambda2_all, step_size, n_folds, max_iter, threshold, n_cores, r, max_value, index_set)); + return rcpp_result_gen; +END_RCPP +} +// omp_max_threads +int omp_max_threads(); +RcppExport SEXP _learner_omp_max_threads() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(omp_max_threads()); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_learner_learner_cpp", (DL_FUNC) &_learner_learner_cpp, 9}, + {"_learner_cv_learner_cpp", (DL_FUNC) &_learner_cv_learner_cpp, 12}, + {"_learner_omp_max_threads", (DL_FUNC) &_learner_omp_max_threads, 0}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_learner(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/kernels.cpp b/src/kernels.cpp new file mode 100644 index 0000000..dbf2ed3 --- /dev/null +++ b/src/kernels.cpp @@ -0,0 +1,204 @@ +#include <RcppEigen.h> +#include <Rcpp.h> +#include <vector> +#include <algorithm> +#include <cmath> +#include <limits> + +#ifdef _OPENMP +#include <omp.h> +#endif + +using namespace Rcpp; +using namespace Eigen; + +//-------------------------------------------------------------- +// Internal worker function: computes the learner estimate - thread safe for OMP +List learner_worker(const Eigen::MatrixXd &Y_source, + const Eigen::MatrixXd &Y_target, + int r, double lambda1, double lambda2, + double step_size, int max_iter, double threshold, + double max_value) { + + Eigen::BDCSVD<Eigen::MatrixXd> svd(Y_source, Eigen::ComputeThinU | Eigen::ComputeThinV); + Eigen::MatrixXd U_full = svd.matrixU(); + Eigen::MatrixXd V_full = svd.matrixV(); + int r_use = std::min(r, static_cast<int>(U_full.cols())); + Eigen::MatrixXd U_trunc = U_full.leftCols(r_use); + Eigen::MatrixXd V_trunc = V_full.leftCols(r_use); + Eigen::VectorXd singular_vals = svd.singularValues().head(r_use); + + // Initialize as in R: U_init = U_trunc * sqrt(D), V_init = V_trunc * sqrt(D) + Eigen::MatrixXd sqrtD = singular_vals.array().sqrt().matrix().asDiagonal(); + Eigen::MatrixXd U = U_trunc * sqrtD; + Eigen::MatrixXd V = V_trunc * sqrtD; + + // Precompute reusable matrices + Eigen::MatrixXd U_trunc_T = U_trunc.transpose(); + Eigen::MatrixXd V_trunc_T = V_trunc.transpose(); + + double perc_nonmissing = 1.0 - (static_cast<double>((Y_target.array().isNaN()).count()) / Y_target.size()); + bool missing = Y_target.hasNaN(); + + double obj_init = 0.0; + { + Eigen::MatrixXd diff = U * V.transpose() - Y_target; + if (missing) { + diff = diff.array().isNaN().select(Eigen::MatrixXd::Zero(diff.rows(), diff.cols()), diff); + } + obj_init = diff.squaredNorm() / perc_nonmissing + + lambda1 * (U - U_trunc * (U_trunc_T * U)).squaredNorm() + + lambda1 * (V - V_trunc * (V_trunc_T * V)).squaredNorm() + + lambda2 * (U.transpose() * U - V.transpose() * V).squaredNorm(); + } + + double obj_best = obj_init; + Eigen::MatrixXd U_best = U; + Eigen::MatrixXd V_best = V; + double U_norm = U.norm(); + double V_norm = V.norm(); + + int convergence_criterion = 
2; + std::vector<double> obj_values; + obj_values.reserve(max_iter); + for (int iter = 0; iter < max_iter; ++iter) { + Eigen::MatrixXd U_tilde = U.transpose() * U; + Eigen::MatrixXd V_tilde = V.transpose() * V; + + Eigen::MatrixXd adjusted_theta = Y_target; + if (missing) { + Eigen::MatrixXd temp = U * V.transpose(); + adjusted_theta = adjusted_theta.array().isNaN().select(temp, adjusted_theta); + } + + Eigen::MatrixXd grad_U = (2.0 / perc_nonmissing) * (U * V_tilde - adjusted_theta * V) + + lambda1 * 2 * (U - U_trunc * (U_trunc_T * U)) + + lambda2 * 4 * U * (U_tilde - V_tilde); + Eigen::MatrixXd grad_V = (2.0 / perc_nonmissing) * (V * U_tilde - adjusted_theta.transpose() * U) + + lambda1 * 2 * (V - V_trunc * (V_trunc_T * V)) + + lambda2 * 4 * V * (V_tilde - U_tilde); + + double grad_U_norm = grad_U.norm(); + double grad_V_norm = grad_V.norm(); + + U = U - (step_size * U_norm / (grad_U_norm + 1e-12)) * grad_U; + V = V - (step_size * V_norm / (grad_V_norm + 1e-12)) * grad_V; + U_norm = U.norm(); + V_norm = V.norm(); + + double obj = 0.0; + { + Eigen::MatrixXd diff = U * V.transpose() - Y_target; + if (missing) { + diff = diff.array().isNaN().select(Eigen::MatrixXd::Zero(diff.rows(), diff.cols()), diff); + } + obj = diff.squaredNorm() / perc_nonmissing + + lambda1 * (U - U_trunc * (U_trunc_T * U)).squaredNorm() + + lambda1 * (V - V_trunc * (V_trunc_T * V)).squaredNorm() + + lambda2 * (U.transpose() * U - V.transpose() * V).squaredNorm(); + } + obj_values.push_back(obj); + + if (obj < obj_best) { + obj_best = obj; + U_best = U; + V_best = V; + } + + // Checking for convergence conditions + if (iter > 0 && std::abs(obj - obj_values[iter - 1]) < threshold) { + convergence_criterion = 1; + break; + } + if (iter > 0 && obj > max_value * obj_init) { + convergence_criterion = 3; + break; + } + obj_init = obj; + } + + return List::create( + Named("learner_estimate") = U_best * V_best.transpose(), + Named("objective_values") = obj_values, + Named("convergence_criterion") = convergence_criterion + ); +} + +//-------------------------------------------------------------- +// Exported learner function. 
+// [[Rcpp::export]] +List learner_cpp(const Eigen::MatrixXd &Y_source, const Eigen::MatrixXd &Y_target, + int r, double lambda1, double lambda2, double step_size, int max_iter, double threshold, double max_value) { + List out = learner_worker(Y_source, Y_target, r, lambda1, lambda2, step_size, max_iter, threshold, max_value); + return List::create( + Named("learner_estimate") = out["learner_estimate"], + Named("objective_values") = out["objective_values"], + Named("convergence_criterion") = out["convergence_criterion"], + Named("r") = r + ); +} + +//-------------------------------------------------------------- +// [[Rcpp::export]] +List cv_learner_cpp(const Eigen::MatrixXd &Y_source, const Eigen::MatrixXd &Y_target, + const std::vector<double> &lambda1_all, const std::vector<double> &lambda2_all, + double step_size, int n_folds, int max_iter, double threshold, + int n_cores, int r, double max_value, + const std::vector<std::vector<int>> &index_set) { + int p = Y_source.rows(); + int q = Y_source.cols(); + int n_lambda1 = lambda1_all.size(); + int n_lambda2 = lambda2_all.size(); + + Eigen::MatrixXd mse_all = Eigen::MatrixXd::Zero(n_lambda1, n_lambda2); + +#pragma omp parallel for collapse(2) schedule(dynamic) num_threads(n_cores) + for (int i = 0; i < n_lambda1; ++i) { + for (int j = 0; j < n_lambda2; ++j) { + double mse = 0.0; + + for (int fold = 0; fold < n_folds; ++fold) { + Eigen::MatrixXd Y_train = Y_target; + for (int idx : index_set[fold]) { + int row = idx % p; + int col = idx / p; + Y_train(row, col) = std::numeric_limits<double>::quiet_NaN(); + } + List temp = learner_worker(Y_source, Y_train, r, + lambda1_all[i], lambda2_all[j], + step_size, max_iter, threshold, max_value); + Eigen::MatrixXd learner_estimate = temp["learner_estimate"]; + double fold_mse = 0.0; + for (int idx : index_set[fold]) { + int row = idx % p; + int col = idx / p; + double diff = learner_estimate(row, col) - Y_target(row, col); + fold_mse += diff * diff; + } + mse += fold_mse; + } + mse_all(i, j) = mse; + } + } + + Eigen::Index minRow, minCol; + mse_all.minCoeff(&minRow, &minCol); + + return List::create( + Named("lambda_1_min") = lambda1_all[minRow], + Named("lambda_2_min") = lambda2_all[minCol], + Named("mse_all") = mse_all, + Named("r") = r + ); +} + +// -------------------------------------------------------------- +// [[Rcpp::export]] +int omp_max_threads() { +#ifdef _OPENMP + return omp_get_max_threads(); +#else + return 1; +#endif +} + diff --git a/src/learner-package.cpp b/src/learner-package.cpp new file mode 100644 index 0000000..540847e --- /dev/null +++ b/src/learner-package.cpp @@ -0,0 +1,2 @@ +#include <Rcpp.h> +using namespace Rcpp; diff --git a/tests/testthat/test-main.R b/tests/testthat/test-main.R index 2de022f..7cf7bcb 100644 --- a/tests/testthat/test-main.R +++ b/tests/testthat/test-main.R @@ -17,6 +17,19 @@ test_that("cv.learner (parallel) does not fail", { n_cores = 2)) }) +test_that("cv.learner output is correct", { + expected_result <- t(matrix(c(5494.916, 5312.724, + 5341.662, 5233.828, + 5009.216, 5013.827), nrow = 2, ncol = 3)) + set.seed(1234) + result <- cv.learner(Y_source = dat_highsim$Y_source, + Y_target = dat_highsim$Y_target, + lambda_1_all = c(1, 10, 100), + lambda_2_all = c(1, 10), + step_size = 0.003) + expect_equal(result$mse, expected_result, tolerance = 1e-3) +}) + test_that("learner does not fail", { expect_no_error( learner(Y_source = dat_highsim$Y_source, @@ -25,12 +38,36 @@ test_that("learner does not fail", { step_size = 0.003)) }) +test_that("learner output is correct", { + expected_result <- 
t(matrix(c(0.1578405, -1.54883716, 1.2205437, 0.03377010, 0.0767873, + 0.1189457, -0.12793612, -0.9755801, 0.01372606, -0.3902793, + -0.4384613, 1.21518704, -1.0718671, 0.60388502, -0.4104101, + 1.4426139, -3.13260948, 0.9175863, -1.79041482, 0.5015381, + 0.5602868, -0.05068081, -1.3878535, -0.78502749, -0.1315952), nrow = 5, ncol = 5)) + result <- learner(Y_source = dat_highsim$Y_source, + Y_target = dat_highsim$Y_target, + lambda_1 = 1, lambda_2 = 1, + step_size = 0.003) + expect_equal(result$learner_estimate[1:5, 1:5], expected_result, tolerance = 1e-5) +}) + test_that("dlearner does not fail", { expect_no_error( dlearner(Y_source = dat_highsim$Y_source, Y_target = dat_highsim$Y_target)) }) +test_that("dlearner output is correct", { + expected_result <- t(matrix(c(0.0959171, -1.72143637, 1.15500149, 0.005478273, 0.3258085, + 0.1077137, -0.32243480, -1.03557177, -0.108195636, -0.4093049, + -0.7237275, 0.86634922, -0.43528206, 1.105802194, -0.5961351, + 1.6347696, -2.91985925, 0.04876288, -2.357504089, 0.8127403, + 0.4757450, 0.05665543, -1.50373470, -0.744076401, -0.2876033), nrow = 5, ncol = 5)) + result <- dlearner(Y_source = dat_highsim$Y_source, + Y_target = dat_highsim$Y_target) + expect_equal(result$dlearner_estimate[1:5, 1:5], expected_result, tolerance = 1e-5) +}) + # Missing data cases set.seed(1234) @@ -45,6 +82,19 @@ test_that("learner does not fail with missing data", { step_size = 0.003)) }) +test_that("learner output is correct with missing data", { + expected_result <- t(matrix(c(0.1402368, -1.53788299, 1.294196, 0.06241446, 0.1034029, + 0.2234374, -0.16286860, -1.017956, -0.13353267, -0.3327750, + -0.4183411, 1.20040593, -1.115950, 0.59002710, -0.4299424, + 1.4181535, -3.09632714, 1.011979, -1.75238013, 0.5554817, + 0.5603679, -0.06539629, -1.381598, -0.75764518, -0.1373639), nrow = 5, ncol = 5)) + result <- learner(Y_source = dat_highsim$Y_source, + Y_target = Y_target_highsim_missing, + lambda_1 = 1, lambda_2 = 1, + step_size = 0.003) + expect_equal(result$learner_estimate[1:5, 1:5], expected_result, tolerance = 1e-5) +}) + test_that("cv.learner does not fail with missing data", { expect_no_error( cv.learner(Y_source = dat_highsim$Y_source, @@ -53,3 +103,16 @@ test_that("cv.learner does not fail with missing data", { lambda_2_all = c(1, 10), step_size = 0.003)) }) + +test_that("cv.learner output is correct with missing data", { + expected_result <- t(matrix(c(5457.513, 5254.740, + 5291.389, 5175.707, + 4933.222, 4947.356), nrow = 2, ncol = 3)) + set.seed(1234) + result <- cv.learner(Y_source = dat_highsim$Y_source, + Y_target = Y_target_highsim_missing, + lambda_1_all = c(1, 10, 100), + lambda_2_all = c(1, 10), + step_size = 0.003) + expect_equal(result$mse, expected_result, tolerance = 1e-3) +})
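For completeness, a short end-to-end sketch of how the reworked interface is exercised, mirroring the roxygen examples and the regression tests above (`dat_highsim` ships with the package; the `n_cores` value is illustrative and only enables OpenMP threading when it is available):

```r
library(learner)

# Select (lambda_1, lambda_2) by k-fold cross-validation; candidate pairs are
# evaluated in parallel via OpenMP when n_cores > 1.
cv_res <- cv.learner(Y_source = dat_highsim$Y_source,
                     Y_target = dat_highsim$Y_target,
                     lambda_1_all = c(1, 10, 100),
                     lambda_2_all = c(1, 10, 100),
                     step_size = 0.003,
                     n_cores = 2)

# Refit LEARNER at the selected penalty values.
fit <- learner(Y_source = dat_highsim$Y_source,
               Y_target = dat_highsim$Y_target,
               lambda_1 = cv_res$lambda_1_min,
               lambda_2 = cv_res$lambda_2_min,
               step_size = 0.003)
dim(fit$learner_estimate)
```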