How to maintain data consistency between R and Rcpp?

Question

I'm doing matrix calculations, and in order to improve performance, I'm programming this process using Rcpp. However, I found that the results of the two calculations could not be consistent because of the accuracy problem.

In order to reduce the memory footprint, I use big.memory to save data to the hard disk. To speed up the computation, use openmpi to speed up the for loop. The intermediate calculation involves only two steps, one is the Hadamard product operation and the other is the normal matrix multiplication. The data type I used was double, and according to the IEEE 754 standard, the number of decimal places after the decimal point should be the same in R and Rcpp. I don't know what went wrong.

I found that this inconsistency was not due to R and Rcpp passing, when I read TRAN (where it is decimal) matrices using R in Linux, the data was already inconsistent, executing the code on Windows would be consistent. Even though I tried to calculate with R in Linux from the very raw data, the resulting data was still not what I expected.

Code in Rcpp:

// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::depends(bigmemory)]]

#include <RcppArmadillo.h>
// #include <Rcpp.h>
#include <bigmemory/MatrixAccessor.hpp>
#include <omp.h>

using namespace std;
using namespace Rcpp;
// using namespace arma;
// using namespace Eigen;

// function for convert R matrix class to arma::mat
arma::mat rMatrixToArmaMat(const Rcpp::NumericMatrix& rMat) {
  return as<arma::mat>(rMat);
}

// [[Rcpp::export]]
arma::mat convertRMatrixToArmaMat(Rcpp::NumericMatrix rMat) {
  return rMatrixToArmaMat(rMat);
}

// Logic for ca_epi_data
template <typename T>
void ca_epi_data(Rcpp::XPtr<BigMatrix> epi_data,
                                  const Rcpp::XPtr<BigMatrix> combos,
                                  const arma::mat& epi1,
                                  const arma::mat& epi2,
                                  const arma::mat& TRAN,
                                  const NumericMatrix& Y,
                                  const NumericMatrix& trdata1,
                                  const NumericMatrix& trdata2,
                                  int threads=0) {
  omp_set_num_threads(threads);
  
  MatrixAccessor<T> epi_col = MatrixAccessor<T>(*epi_data);
  MatrixAccessor<int> combos_col = MatrixAccessor<int>(*combos);
  
  // Check dimensions
  if (epi_data->nrow() != TRAN.n_rows) {
    Rcpp::stop("Mismatch between TRAN rows and epi_data rows.");
  }
  if (epi_data->ncol() != combos->ncol()+1+2*trdata1.ncol()) {
    Rcpp::stop("Mismatch between combos columns and epi_data columns.");
  }
  
  // copy Y, trdata1 and trdata2 to epi_data
  #pragma omp parallel for
  for (size_t j = 0; j < Y.ncol(); j++) {
    if (j >= Y.ncol()) {
      Rcpp::stop("Col index exceeds Y cols.");
    }
    for (size_t k = 0; k < Y.nrow(); k++){
      if (k >= Y.nrow()) {
        Rcpp::stop("Row index exceeds trdata1 rows.");
      }
      // epi_col[i] is pointer,epi_col[i][j] is value.
      epi_col[j][k] = Y(k,j);
    }
  }
  #pragma omp parallel for
  for (size_t j = 0; j < trdata1.ncol(); j++) {
    if (j >= trdata1.ncol()) {
      Rcpp::stop("Col index exceeds trdata1 cols.");
    }
    for (size_t k = 0; k < trdata1.nrow(); k++){
      if (k >= trdata1.nrow()) {
        Rcpp::stop("Row index exceeds trdata1 rows.");
      }
      // epi_col[i] is pointer,epi_col[i][j] is value.
      epi_col[j+1][k] = trdata1(k,j);
    }
  }
  #pragma omp parallel for
  for (size_t j = 0; j < trdata2.ncol(); j++) {
    if (j >= trdata2.ncol()) {
      Rcpp::stop("Col index exceeds trdata1 cols.");
    }
    for (size_t k = 0; k < trdata2.nrow(); k++){
      if (k >= trdata2.nrow()) {
        Rcpp::stop("Row index exceeds trdata2 rows.");
      }
      // epi_col[i] is pointer,epi_col[i][j] is value.
      epi_col[j+1+trdata1.ncol()][k] = trdata2(k,j);
    }
  }

  #pragma omp parallel for
  for(size_t i=0; i < combos->ncol();i++) {
    arma::colvec col_epi1 = epi1.col(combos_col[i][0]-1);
    arma::colvec col_epi2 = epi2.col(combos_col[i][1]-1);
    
    // calculate hadamard product
    arma::colvec hadamard_product = col_epi1 % col_epi2;
    
    // matrix *
    arma::colvec result = TRAN * hadamard_product;
      
    // apply the result to big.matrix
    if (result.n_elem != epi_data->nrow()) {
       Rcpp::stop("Result vector length does not match epi_data row count.");
    }
    for (size_t j = 0; j < result.n_elem; j++) {
       if (j >= epi_data->nrow()) {
         Rcpp::stop("Row index exceeds epi_data rows.");
       }
       // epi_col[i] is pointer,epi_col[i][j] is value.
       epi_col[i+Y.ncol()+trdata1.ncol()+trdata2.ncol()][j] = result[j];
  }
     // #pragma omp critical
     // {
    //   Rcpp::Rcout << "Processed column " << i << " of " << m << std::endl;
     // }
  }
  // return epi_data;
}
  
// [[Rcpp::export]]
void ca_epi_data(SEXP pEpi_data,
                 SEXP pCombos,
                 arma::mat& epi1,
                 arma::mat& epi2,
                 arma::mat& TRAN,
                 NumericMatrix& Y,
                 NumericMatrix& trdata1,
                 NumericMatrix& trdata2,
                 int threads=0){
  XPtr<BigMatrix> xpEpi_data(pEpi_data);
  XPtr<BigMatrix> xpCombos(pCombos);
  if(4 != xpCombos->matrix_type()){
    Rcpp::stop("big.matrix object of pCombos should be int type");
  }
  
  switch(xpEpi_data->matrix_type()) {
  case 1:
    return ca_epi_data<char>(xpEpi_data, xpCombos, epi1, epi2, TRAN,
                             Y, trdata1, trdata2, threads);
  case 2:
    return ca_epi_data<short>(xpEpi_data, xpCombos, epi1, epi2, TRAN,
                              Y, trdata1, trdata2, threads);
  case 4:
    return ca_epi_data<int>(xpEpi_data, xpCombos, epi1, epi2, TRAN,
                            Y, trdata1, trdata2, threads);
  case 8:
    return ca_epi_data<double>(xpEpi_data, xpCombos, epi1, epi2, TRAN,
                               Y, trdata1, trdata2, threads);
  default:
    throw Rcpp::exception("unknown type detected for big.matrix object!");
  }
}

Call in `R`:

library(Rcpp)
library(bigmemory)

# load data
trait = "PH"
load(file = "Adata.RData")
load(file = "Ddata.RData")
load(paste0("trAdata_",trait,".RData"))
load(paste0("trDdata_",trait,".RData"))
load(paste0("TRAN_",trait,".RData"))
load(paste0("Y_",trait,".RData"))
load(file = "./total_ma_names.rdata")
nmar=500
TRAN = TRAN
eff_type = "ad"
epi1 = Adata
epi2 = Ddata
trdata1 = trAdata
trdata2 = trDdata

epi_data <- filebacked.big.matrix(
  nrow = nrow(TRAN),
  ncol = ncol(combos)+2*nmar+ncol(Y),
  type = "double", # char是c++的单个字符
  backingfile = "epi_data.bin",
  backingpath = dirname("epi_data"),
  descriptorfile = "epi_data.des",
  dimnames = c(NULL, NULL)
)

# source Cpp file
sourceCpp(file = "/mnt/f/07-CAUS/01-Linux-service/project_18DH-heterosis/Analysis/31-gwas_mph/test/code_for_epi/source/epi.cpp")

# convert R matrix to arma::mat
epi1 <- convertRMatrixToArmaMat(epi1)
epi2 <- convertRMatrixToArmaMat(epi2)
TRAN <- convertRMatrixToArmaMat(TRAN)

# generate a new matrix
tryCatch({
  ca_epi_data(epi_data@address, combos@address, epi1, epi2, TRAN, 
              Y, trdata1,trdata2, threads = 1)
}, error = function(e) {
  print(e)
})

The results of `R` and `Rcpp` are compared.

The resulting matrix should consist of two columns that are negative to each other. However, in the Rcpp, although the results appear to be the same, the operation + can show that they are slightly different. This affects my subsequent matrix qr decomposition, and I can't get a real rank.

Result of `r`:

>epidesign <- TRAN%*%(epi1[,m]*epi2[,n])
>subdata <- data.frame(y=Y,trdata1[,m],trdata2[,n],epi=epidesign)
> head(subdata[,c(3,4)])
  S3_229468875         epi
1  -0.19933658  0.19933658
2   0.15319562 -0.15319562
3   0.11634120 -0.11634120
4   0.14881466 -0.14881466
5   0.07223461 -0.07223461
6   0.19493439 -0.19493439

> subdata[,4]+subdata[,3]
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [58] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[115] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[172] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[229] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[286] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[343] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[400] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[457] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Result of `Rcpp` matrix operation:

> epi_data[,c(682,3661)]
                [,1]          [,2]
  [1,] -0.1993365796  0.1993365796
  [2,]  0.1531956224 -0.1531956224
  [3,]  0.1163411954 -0.1163411954
  [4,]  0.1488146626 -0.1488146626
  [5,]  0.0722346106 -0.0722346106
  [6,]  0.1949343899 -0.1949343899
  [7,]  0.2105010152 -0.2105010152
  [8,] -0.2105474946  0.2105474946
  [9,] -0.1859160503  0.1859160503
 [10,] -0.0950061868  0.0950061868

> epi_data[,682]+epi_data[,3661]
  [1] -2.775558e-17  2.775558e-17  5.551115e-17  2.775558e-17  0.000000e+00  5.551115e-17
  [7]  2.775558e-17  0.000000e+00 -5.551115e-17  2.775558e-17  0.000000e+00  0.000000e+00
 [13]  1.387779e-17 -6.938894e-17  2.775558e-17  1.110223e-16 -1.387779e-17 -2.775558e-17
 [19]  3.469447e-18 -1.387779e-17  2.775558e-17  1.110223e-16 -2.220446e-16 -2.775558e-17
 [25]  2.775558e-17  6.938894e-18 -3.469447e-17  8.326673e-17  0.000000e+00 -2.775558e-17
 [31]  1.110223e-16  2.775558e-17  1.110223e-16  0.000000e+00  2.775558e-17  6.938894e-18
 [37] -2.775558e-16  2.775558e-17  5.551115e-17  0.000000e+00 -2.775558e-17  0.000000e+00

Reasonable question, but surely for any real world calculation rounding to the nearest quadrillionth would be fine? — Allan Cameron
– Allan Cameron, Commented Nov 24, 2024 at 9:17
I don't know exactly how many bits before the accuracy is consistent, that will be another matter for discussion — G WOLF
– G WOLF, Commented Nov 24, 2024 at 10:02
The differences you calculated mean that the binary representations differ in the last digit of the mantissa. 2.775558e-17=2^-55 . The mantissa has 52 explicit bits plus the implicit leading 1. The first value .199 is 1.1...*2^-3 in binary. Given the 2^-3 factor, 2^-55 is the 52th digit of this mantissa. — Joachim
– Joachim, Commented Nov 24, 2024 at 10:50
Also, as so often, R FAQ 7.31 "Why doesn’t R think these numbers are equal? and the references therein. But @Joachim already brought the bacon home on this and gave you the gist. — Ada Lovelace
– Ada Lovelace, Commented Nov 24, 2024 at 18:58

Collectives™ on Stack Overflow

How to maintain data consistency between R and Rcpp?

Code in Rcpp:

Call in `R`:

The results of `R` and `Rcpp` are compared.

Result of `r`:

Result of `Rcpp` matrix operation:

0

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

Code in Rcpp:

Call in R:

The results of R and Rcpp are compared.

Result of r:

Result of Rcpp matrix operation:

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Call in `R`:

The results of `R` and `Rcpp` are compared.

Result of `r`:

Result of `Rcpp` matrix operation: