diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml
index 4f567086f..6528a4b98 100644
--- a/.github/workflows/oap-mllib-ci.yml
+++ b/.github/workflows/oap-mllib-ci.yml
@@ -27,5 +27,5 @@ jobs:
       run: |
         source ${{github.workspace}}/dev/setup-all.sh
     - name: Build and Test
-      run: |
+      run: |
        ${{github.workspace}}/dev/ci-test.sh
diff --git a/dev/ci-test.sh b/dev/ci-test.sh
index 7b5a1939c..e8e51b1ca 100755
--- a/dev/ci-test.sh
+++ b/dev/ci-test.sh
@@ -54,8 +54,9 @@ for SparkVer in ${SupportedSparkVersions[*]}; do
    mvn --no-transfer-progress -P$SparkVer -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test
    mvn --no-transfer-progress -P$SparkVer -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test
-   # mvn -P$SparkVer -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test
+   # mvn --no-transfer-progress -P$SparkVer -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test
 done
 # Yarn cluster test without profile
-$GITHUB_WORKSPACE/dev/test-cluster/ci-test-cluster.sh
\ No newline at end of file
+$GITHUB_WORKSPACE/dev/ci-build.sh
+$GITHUB_WORKSPACE/dev/test-cluster/ci-test-cluster.sh
diff --git a/dev/codestyle/lint-scala.sh b/dev/codestyle/lint-scala.sh
index e18a731b4..2d947dda8 100755
--- a/dev/codestyle/lint-scala.sh
+++ b/dev/codestyle/lint-scala.sh
@@ -23,7 +23,7 @@ if [ -z $MVN ]; then
    exit 1
 fi
-ERRORS=$($MVN scalastyle:check | grep error)
+ERRORS=$($MVN scalastyle:check | grep "error file")
 if test ! -z "$ERRORS"; then
    echo -e "Scalastyle checks failed at following occurrences:\n$ERRORS"
diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml
index 055308cd6..d37ff879c 100644
--- a/mllib-dal/pom.xml
+++ b/mllib-dal/pom.xml
@@ -1,5 +1,5 @@ - 4.0.0
@@ -273,7 +273,7 @@ ${basedir}/src/test/java - ${basedir}/../dev/codestyle/checkstyle.xml + ${basedir}/../dev/codestyle/checkstyle.xml ${basedir}/target/checkstyle-output.xml
@@ -303,7 +303,7 @@ true false false - ${basedir}/src/main/scala + ${basedir}/src/main/scala/org/apache/spark ${basedir}/src/test/scala scalastyle-config.xml ${basedir}/target/scalastyle-output.xml
@@ -344,7 +344,7 @@ - + maven-antrun-plugin
@@ -354,8 +354,9 @@ process-classes - Building native code - + Building native code +
@@ -375,11 +376,11 @@ ${env.CCL_ROOT}/lib - ${ccl.lib} + ${ccl.lib} ${ccl.mpi.lib} ${ccl.fabric.lib} - + ${env.CCL_ROOT}/lib/prov
@@ -418,18 +419,21 @@ + rename to workaround.
See https://github.com/oneapi-src/oneDAL/issues/1254 --> ${project.build.testOutputDirectory}/lib/${tbb.lib} - ${project.build.testOutputDirectory}/lib/libtbb.so.2 + ${project.build.testOutputDirectory}/lib/libtbb.so.2 + ${project.build.testOutputDirectory}/lib/${tbb.malloc.lib} - ${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2 + ${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2 + ${project.build.testOutputDirectory}/lib/${ccl.mpi.lib} - ${project.build.testOutputDirectory}/lib/libmpi.so.12 + ${project.build.testOutputDirectory}/lib/libmpi.so.12 + diff --git a/mllib-dal/scalastyle-config.xml b/mllib-dal/scalastyle-config.xml index c1dc57be5..7ddb59629 100644 --- a/mllib-dal/scalastyle-config.xml +++ b/mllib-dal/scalastyle-config.xml @@ -49,14 +49,13 @@ This file is divided into 3 sections: - + + ^println$ { - private final Iterator base; - private final int batchSize; - - public BatchIterator(Iterator base, int batchSize) { - this.base = base; - this.batchSize = batchSize; - } - - @Override - public boolean hasNext() { - return base.hasNext(); - } - - @Override - public DataBatch next() { - try { - int numRows = 0; - int numCols = -1; - List batch = new ArrayList<>(batchSize); - while (base.hasNext() && batch.size() < batchSize) { - double[] curValue = base.next(); - if (numCols == -1) { - numCols = curValue.length; - } else if (numCols != curValue.length) { - throw new RuntimeException("Feature size is not the same"); - } - batch.add(curValue); - - numRows++; - } - - long[] rowOffset = new long[numRows]; - double[] values = new double[numRows * numCols]; - - int offset = 0; - for (int i = 0; i < batch.size(); i++) { - double[] curValue = batch.get(i); - rowOffset[i] = i; - System.arraycopy(curValue, 0, values, offset, - curValue.length); - offset += curValue.length; - } - - return new DataBatch(rowOffset, values, numCols); - } catch (RuntimeException runtimeError) { - - return null; - } - } - - @Override - public void remove() { - throw new UnsupportedOperationException("DataBatch.BatchIterator.remove"); - } - } -} diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java index d8ea09a23..eada9b20c 100644 --- a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java +++ b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java @@ -1,4 +1,4 @@ -/******************************************************************************* +/* * Copyright 2020 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,142 +12,141 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
- *******************************************************************************/ - -// Based on oneDAL Java com.intel.daal.utils.libUtils code + */ package org.apache.spark.ml.util; -import java.io.*; -import java.util.UUID; -import java.util.logging.Level; +import com.intel.daal.utils.LibUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.intel.daal.utils.LibUtils; +import java.io.*; +import java.util.UUID; public final class LibLoader { - private static final String LIBRARY_PATH_IN_JAR = "/lib"; - // Make sure loading libraries from different temp directory for each process - private final static String subDir = "MLlibDAL_" + UUID.randomUUID(); - - private static final Logger log = LoggerFactory.getLogger("LibLoader"); - - /** - * Get temp dir for exacting lib files - * @return path of temp dir - */ - public static String getTempSubDir() { - String tempSubDirectory = System.getProperty("java.io.tmpdir") + "/" + subDir + LIBRARY_PATH_IN_JAR; - return tempSubDirectory; - } - - - /** - * Load oneCCL and MLlibDAL libs - */ - public static synchronized void loadLibraries() throws IOException { - loadLibCCL(); - loadLibMLlibDAL(); + private static final String LIBRARY_PATH_IN_JAR = "/lib"; + // Make sure loading libraries from different temp directory for each process + private static final String subDir = "MLlibDAL_" + UUID.randomUUID(); + + private static final Logger log = LoggerFactory.getLogger("LibLoader"); + + /** + * Get temp dir for exacting lib files + * + * @return path of temp dir + */ + public static String getTempSubDir() { + String tempSubDirectory = System.getProperty("java.io.tmpdir") + + "/" + subDir + LIBRARY_PATH_IN_JAR; + return tempSubDirectory; + } + + /** + * Load oneCCL and MLlibDAL libs + */ + public static synchronized void loadLibraries() throws IOException { + loadLibCCL(); + loadLibMLlibDAL(); + } + + /** + * Load oneCCL libs in dependency order + */ + private static synchronized void loadLibCCL() throws IOException { + loadFromJar(subDir, "libfabric.so.1"); + loadFromJar(subDir, "libmpi.so.12"); + loadFromJar(subDir, "libccl.so"); + loadFromJar(subDir, "libsockets-fi.so"); + } + + /** + * Load MLlibDAL lib, it depends TBB libs that are loaded by oneDAL, so this + * function should be called after oneDAL loadLibrary + */ + private static synchronized void loadLibMLlibDAL() throws IOException { + // oneDAL Java API doesn't load correct libtbb version for oneAPI Beta 10 + // Rename in pom.xml and assembly.xml to workaround. 
+ // See https://github.com/oneapi-src/oneDAL/issues/1254 --> + LibUtils.loadLibrary(); + + loadFromJar(subDir, "libMLlibDAL.so"); + } + + /** + * Load lib as resource + * + * @param path sub folder (in temporary folder) name + * @param name library name + */ + private static void loadFromJar(String path, String name) throws IOException { + log.debug("Loading " + name + " ..."); + + File fileOut = createTempFile(path, name); + // File exists already + if (fileOut == null) { + log.debug("DONE: Loading library as resource."); + return; } - /** - * Load oneCCL libs in dependency order - */ - private static synchronized void loadLibCCL() throws IOException { - loadFromJar(subDir, "libfabric.so.1"); - loadFromJar(subDir, "libmpi.so.12"); - loadFromJar(subDir, "libccl.so"); - loadFromJar(subDir, "libsockets-fi.so"); + InputStream streamIn = LibLoader.class.getResourceAsStream(LIBRARY_PATH_IN_JAR + "/" + name); + if (streamIn == null) { + throw new IOException("Error: No resource found."); } - /** - * Load MLlibDAL lib, it depends TBB libs that are loaded by oneDAL, - * so this function should be called after oneDAL loadLibrary - */ - private static synchronized void loadLibMLlibDAL() throws IOException { - // oneDAL Java API doesn't load correct libtbb version for oneAPI Beta 10 - // Rename in pom.xml and assembly.xml to workaround. - // See https://github.com/oneapi-src/oneDAL/issues/1254 --> - LibUtils.loadLibrary(); - - loadFromJar(subDir, "libMLlibDAL.so"); - } + try (OutputStream streamOut = new FileOutputStream(fileOut)) { + log.debug("Writing resource to temp file."); - /** - * Load lib as resource - * - * @param path sub folder (in temporary folder) name - * @param name library name - */ - private static void loadFromJar(String path, String name) throws IOException { - log.debug("Loading " + name + " ..."); - - File fileOut = createTempFile(path, name); - // File exists already - if (fileOut == null) { - log.debug("DONE: Loading library as resource."); - return; + byte[] buffer = new byte[32768]; + while (true) { + int read = streamIn.read(buffer); + if (read < 0) { + break; } - - InputStream streamIn = LibLoader.class.getResourceAsStream(LIBRARY_PATH_IN_JAR + "/" + name); - if (streamIn == null) { - throw new IOException("Error: No resource found."); - } - - try (OutputStream streamOut = new FileOutputStream(fileOut)) { - log.debug("Writing resource to temp file."); - - byte[] buffer = new byte[32768]; - while (true) { - int read = streamIn.read(buffer); - if (read < 0) { - break; - } - streamOut.write(buffer, 0, read); - } - - streamOut.flush(); - } catch (IOException e) { - throw new IOException("Error: I/O error occurs from/to temp file."); - } finally { - streamIn.close(); - } - - System.load(fileOut.toString()); - log.debug("DONE: Loading library as resource."); + streamOut.write(buffer, 0, read); + } + + streamOut.flush(); + } catch (IOException e) { + throw new IOException("Error: I/O error occurs from/to temp file."); + } finally { + streamIn.close(); } - /** - * Create temporary file - * - * @param name library name - * @param tempSubDirName sub folder (in temporary folder) name - * @return temporary file handler. null if file exist already. 
- */ - private static File createTempFile(String tempSubDirName, String name) throws IOException { - File tempSubDirectory = new File(System.getProperty("java.io.tmpdir") + "/" + tempSubDirName + LIBRARY_PATH_IN_JAR); - - if (!tempSubDirectory.exists()) { - tempSubDirectory.mkdirs(); - // Check existance again, don't use return bool of mkdirs - if (!tempSubDirectory.exists()) { - throw new IOException("Error: Can`t create folder for temp file."); - } - } - - String tempFileName = tempSubDirectory + "/" + name; - File tempFile = new File(tempFileName); + System.load(fileOut.toString()); + log.debug("DONE: Loading library as resource."); + } + + /** + * Create temporary file + * + * @param name library name + * @param tempSubDirName sub folder (in temporary folder) name + * @return temporary file handler. null if file exist already. + */ + private static File createTempFile(String tempSubDirName, String name) throws IOException { + File tempSubDirectory = new File( + System.getProperty("java.io.tmpdir") + "/" + tempSubDirName + LIBRARY_PATH_IN_JAR); + + if (!tempSubDirectory.exists()) { + tempSubDirectory.mkdirs(); + // Check existance again, don't use return bool of mkdirs + if (!tempSubDirectory.exists()) { + throw new IOException("Error: Can`t create folder for temp file."); + } + } - if (tempFile == null) { - throw new IOException("Error: Can`t create temp file."); - } + String tempFileName = tempSubDirectory + "/" + name; + File tempFile = new File(tempFileName); - if (tempFile.exists()) { - return null; - } + if (tempFile == null) { + throw new IOException("Error: Can`t create temp file."); + } - return tempFile; + if (tempFile.exists()) { + return null; } + return tempFile; + } + } diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/util/Service.java b/mllib-dal/src/main/java/org/apache/spark/ml/util/Service.java index 4a091562a..91456bc96 100644 --- a/mllib-dal/src/main/java/org/apache/spark/ml/util/Service.java +++ b/mllib-dal/src/main/java/org/apache/spark/ml/util/Service.java @@ -1,29 +1,29 @@ -/* file: Service.java */ -/******************************************************************************* -* Copyright 2014-2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - /* - // Based on oneDAL Java example code - // Content: - // Auxiliary functions used in Java examples - //////////////////////////////////////////////////////////////////////////////// + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. */ +// Based on oneDAL Java example code + package org.apache.spark.ml.util; +import com.intel.daal.data_management.data.CSRNumericTable; +import com.intel.daal.data_management.data.HomogenNumericTable; +import com.intel.daal.data_management.data.NumericTable; +import com.intel.daal.services.DaalContext; +import com.intel.daal.services.ErrorHandling; + import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; @@ -32,562 +32,562 @@ import java.text.DecimalFormat; import java.util.ArrayList; -import com.intel.daal.data_management.data.CSRNumericTable; -import com.intel.daal.data_management.data.HomogenNumericTable; -import com.intel.daal.data_management.data.NumericTable; -import com.intel.daal.data_management.data.KeyValueDataCollection; -import com.intel.daal.data_management.data_source.*; -import com.intel.daal.services.DaalContext; -import com.intel.daal.services.ErrorHandling; - public class Service { - public static void readRow(String line, int offset, int nCols, double[] data) throws IOException { - if (line == null) { - throw new IOException("Unable to read input dataset"); - } - - String[] elements = line.split(","); - for (int j = 0; j < nCols; j++) { - data[offset + j] = Double.parseDouble(elements[j]); - } + public static void readRow(String line, int offset, int nCols, double[] data) throws IOException { + if (line == null) { + throw new IOException("Unable to read input dataset"); } - public static void readRow(String line, int offset, int nCols, long[] data) throws IOException { - if (line == null) { - throw new IOException("Unable to read input dataset"); - } - - String[] elements = line.split(","); - for (int j = 0; j < nCols; j++) { - data[offset + j] = Long.parseLong(elements[j]); - } + String[] elements = line.split(","); + for (int j = 0; j < nCols; j++) { + data[offset + j] = Double.parseDouble(elements[j]); } + } - public static void readRow(String line, int offset, int nCols, float[] data) throws IOException { - if (line == null) { - throw new IOException("Unable to read input dataset"); - } - - String[] elements = line.split(","); - for (int j = 0; j < nCols; j++) { - data[offset + j] = Float.parseFloat(elements[j]); - } + public static void readRow(String line, int offset, int nCols, long[] data) throws IOException { + if (line == null) { + throw new IOException("Unable to read input dataset"); } - public static void readSparseData(String dataset, int nVectors, int nNonZeroValues, long[] rowOffsets, - long[] colIndices, double[] data) { - try { - BufferedReader bufferedReader = new BufferedReader(new FileReader(dataset)); - readRow(bufferedReader.readLine(), 0, nVectors + 1, rowOffsets); - readRow(bufferedReader.readLine(), 0, nNonZeroValues, colIndices); - readRow(bufferedReader.readLine(), 0, nNonZeroValues, data); - bufferedReader.close(); - } catch (IOException e) { - ErrorHandling.printThrowable(e); - } catch (NumberFormatException e) { - ErrorHandling.printThrowable(e); - } + String[] elements = line.split(","); + for (int j = 0; j < nCols; j++) { + data[offset + j] = Long.parseLong(elements[j]); } + } - private static int getRowLength(String line) { - String[] elements = line.split(","); - return elements.length; + public static void readRow(String line, int offset, int nCols, float[] data) throws IOException { + if (line == null) { + throw new IOException("Unable to read input dataset"); } - public static CSRNumericTable 
createSparseTable(DaalContext context, String dataset) throws IOException { - BufferedReader bufferedReader = new BufferedReader(new FileReader(dataset)); + String[] elements = line.split(","); + for (int j = 0; j < nCols; j++) { + data[offset + j] = Float.parseFloat(elements[j]); + } + } + + public static void readSparseData(String dataset, int nVectors, int nNonZeroValues, + long[] rowOffsets, long[] colIndices, double[] data) { + try { + BufferedReader bufferedReader = new BufferedReader(new FileReader(dataset)); + readRow(bufferedReader.readLine(), 0, nVectors + 1, rowOffsets); + readRow(bufferedReader.readLine(), 0, nNonZeroValues, colIndices); + readRow(bufferedReader.readLine(), 0, nNonZeroValues, data); + bufferedReader.close(); + } catch (IOException e) { + ErrorHandling.printThrowable(e); + } catch (NumberFormatException e) { + ErrorHandling.printThrowable(e); + } + } - String rowIndexLine = bufferedReader.readLine(); - int nVectors = getRowLength(rowIndexLine); - long[] rowOffsets = new long[nVectors]; + private static int getRowLength(String line) { + String[] elements = line.split(","); + return elements.length; + } - readRow(rowIndexLine, 0, nVectors, rowOffsets); - nVectors = nVectors - 1; + public static CSRNumericTable createSparseTable(DaalContext context, + String dataset) throws IOException { + BufferedReader bufferedReader = new BufferedReader(new FileReader(dataset)); - String columnsLine = bufferedReader.readLine(); - int nCols = getRowLength(columnsLine); + String rowIndexLine = bufferedReader.readLine(); + int nVectors = getRowLength(rowIndexLine); + long[] rowOffsets = new long[nVectors]; - long[] colIndices = new long[nCols]; - readRow(columnsLine, 0, nCols, colIndices); + readRow(rowIndexLine, 0, nVectors, rowOffsets); + nVectors = nVectors - 1; - String valuesLine = bufferedReader.readLine(); - int nNonZeros = getRowLength(valuesLine); + String columnsLine = bufferedReader.readLine(); + int nCols = getRowLength(columnsLine); - float[] data = new float[nNonZeros]; - readRow(valuesLine, 0, nNonZeros, data); + long[] colIndices = new long[nCols]; + readRow(columnsLine, 0, nCols, colIndices); - bufferedReader.close(); + String valuesLine = bufferedReader.readLine(); + int nNonZeros = getRowLength(valuesLine); - long maxCol = 0; - for (int i = 0; i < nCols; i++) { - if (colIndices[i] > maxCol) { - maxCol = colIndices[i]; - } - } - int nFeatures = (int) maxCol; + float[] data = new float[nNonZeros]; + readRow(valuesLine, 0, nNonZeros, data); - if (nCols != nNonZeros || nNonZeros != (rowOffsets[nVectors] - 1) || nFeatures == 0 || nVectors == 0) { - throw new IOException("Unable to read input dataset"); - } + bufferedReader.close(); - return new CSRNumericTable(context, data, colIndices, rowOffsets, nFeatures, nVectors); + long maxCol = 0; + for (int i = 0; i < nCols; i++) { + if (colIndices[i] > maxCol) { + maxCol = colIndices[i]; + } } + int nFeatures = (int) maxCol; - public static void printClassificationResult(float[] groundTruth, float[] classificationResults, - String classificatorName) { - System.out.println(classificatorName + " classification:"); - System.out.println("Ground truth | Classification results"); - - for (int i = 0; i < Math.min(groundTruth.length, 20); i++) { - System.out.format("%+f\t\t%+f\n", groundTruth[i], classificationResults[i]); - } + if (nCols != nNonZeros || nNonZeros != (rowOffsets[nVectors] - 1) + || nFeatures == 0 || nVectors == 0) { + throw new IOException("Unable to read input dataset"); } - public static void 
printClassificationResult(NumericTable groundTruth, NumericTable classificationResults, - String header1, String header2, String message, int nMaxRows) { - int nCols = (int) groundTruth.getNumberOfColumns(); - int nRows = Math.min((int) groundTruth.getNumberOfRows(), nMaxRows); + return new CSRNumericTable(context, data, colIndices, rowOffsets, nFeatures, nVectors); + } - FloatBuffer dataGroundTruth = FloatBuffer.allocate(nCols * nRows); - FloatBuffer dataClassificationResults = FloatBuffer.allocate(nCols * nRows); - try { - dataGroundTruth = groundTruth.getBlockOfRows(0, nRows, dataGroundTruth); - dataClassificationResults = classificationResults.getBlockOfRows(0, nRows, dataClassificationResults); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; - } - System.out.println(message); - System.out.println(header1 + "\t" + header2); - for (int i = 0; i < nRows; i++) { - for (int j = 0; j < 1; j++) { - System.out.format("%+.0f\t\t%+.0f\n", dataGroundTruth.get(i * nCols + j), - dataClassificationResults.get(i * nCols + j)); - } - } + public static void printClassificationResult(float[] groundTruth, float[] classificationResults, + String classificatorName) { + System.out.println(classificatorName + " classification:"); + System.out.println("Ground truth | Classification results"); + + for (int i = 0; i < Math.min(groundTruth.length, 20); i++) { + System.out.format("%+f\t\t%+f\n", groundTruth[i], classificationResults[i]); + } + } + + public static void printClassificationResult(NumericTable groundTruth, + NumericTable classificationResults, + String header1, String header2, + String message, int nMaxRows) { + int nCols = (int) groundTruth.getNumberOfColumns(); + int nRows = Math.min((int) groundTruth.getNumberOfRows(), nMaxRows); + + FloatBuffer dataGroundTruth = FloatBuffer.allocate(nCols * nRows); + FloatBuffer dataClassificationResults = FloatBuffer.allocate(nCols * nRows); + try { + dataGroundTruth = groundTruth.getBlockOfRows(0, nRows, dataGroundTruth); + dataClassificationResults = classificationResults.getBlockOfRows(0, nRows, + dataClassificationResults); + } catch (IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; + } + System.out.println(message); + System.out.println(header1 + "\t" + header2); + for (int i = 0; i < nRows; i++) { + for (int j = 0; j < 1; j++) { + System.out.format("%+.0f\t\t%+.0f\n", dataGroundTruth.get(i * nCols + j), + dataClassificationResults.get(i * nCols + j)); + } } + } - public static void printClassificationResult(long[] groundTruth, long[] classificationResults, - String classificatorName) { - System.out.println(classificatorName + " classification:"); - System.out.println("Ground truth | Classification results"); + public static void printClassificationResult(long[] groundTruth, long[] classificationResults, + String classificatorName) { + System.out.println(classificatorName + " classification:"); + System.out.println("Ground truth | Classification results"); - for (int i = 0; i < Math.min(groundTruth.length, 20); i++) { - System.out.format("%+d\t\t%+d\n", groundTruth[i], classificationResults[i]); - } + for (int i = 0; i < Math.min(groundTruth.length, 20); i++) { + System.out.format("%+d\t\t%+d\n", groundTruth[i], classificationResults[i]); } + } - public static void printClassificationResult(long[] groundTruth, int[] classificationResults, - String classificatorName) { - System.out.println(classificatorName + " classification:"); - System.out.println("Ground truth | Classification results"); + 
public static void printClassificationResult(long[] groundTruth, int[] classificationResults, + String classificatorName) { + System.out.println(classificatorName + " classification:"); + System.out.println("Ground truth | Classification results"); - for (int i = 0; i < Math.min(groundTruth.length, 20); i++) { - System.out.format("%+d\t\t%+d\n", groundTruth[i], classificationResults[i]); - } + for (int i = 0; i < Math.min(groundTruth.length, 20); i++) { + System.out.format("%+d\t\t%+d\n", groundTruth[i], classificationResults[i]); } - - public static void printMatrix(double[] matrix, int nCols, int nRows, String header) { - System.out.println(header); - DecimalFormat numberFormat = new DecimalFormat("##0.00"); - for (int i = 0; i < nRows; i++) { - for (int j = 0; j < nCols; j++) { - System.out.print(numberFormat.format(matrix[i * nCols + j]) + "\t\t"); - } - System.out.println(); - } + } + + public static void printMatrix(double[] matrix, int nCols, int nRows, String header) { + System.out.println(header); + DecimalFormat numberFormat = new DecimalFormat("##0.00"); + for (int i = 0; i < nRows; i++) { + for (int j = 0; j < nCols; j++) { + System.out.print(numberFormat.format(matrix[i * nCols + j]) + "\t\t"); + } + System.out.println(); } - - public static void printTriangularMatrix(double[] triangularMatrix, int nDimensions, String header) { - int index = 0; - for (int i = 0; i < nDimensions; i++) { - for (int j = 0; j <= i; j++) { - System.out.print(triangularMatrix[index++] + " "); - } - System.out.println(); - } + } + + public static void printTriangularMatrix(double[] triangularMatrix, int nDimensions, + String header) { + int index = 0; + for (int i = 0; i < nDimensions; i++) { + for (int j = 0; j <= i; j++) { + System.out.print(triangularMatrix[index++] + " "); + } + System.out.println(); } - - public static void printPackedNumericTable(HomogenNumericTable nt, long nDimensions, String header) { - double[] results = nt.getDoubleArray(); - printTriangularMatrix(results, (int) nDimensions, header); + } + + public static void printPackedNumericTable(HomogenNumericTable nt, long nDimensions, + String header) { + double[] results = nt.getDoubleArray(); + printTriangularMatrix(results, (int) nDimensions, header); + } + + public static boolean isUpper(NumericTable.StorageLayout layout) { + return layout.ordinal() == NumericTable.StorageLayout.upperPackedSymmetricMatrix.ordinal() + || layout.ordinal() == NumericTable.StorageLayout.upperPackedTriangularMatrix.ordinal(); + } + + public static boolean isLower(NumericTable.StorageLayout layout) { + return layout.ordinal() == NumericTable.StorageLayout.lowerPackedSymmetricMatrix.ordinal() + || layout.ordinal() == NumericTable.StorageLayout.lowerPackedTriangularMatrix.ordinal(); + } + + public static void printNumericTable(String header, NumericTable nt, + long nPrintedRows, long nPrintedCols) { + long nNtCols = nt.getNumberOfColumns(); + long nNtRows = nt.getNumberOfRows(); + long nRows = nNtRows; + long nCols = nNtCols; + + NumericTable.StorageLayout layout = nt.getDataLayout(); + + if (nPrintedRows > 0) { + nRows = Math.min(nNtRows, nPrintedRows); } - public static boolean isUpper(NumericTable.StorageLayout layout) - { - return layout.ordinal() == NumericTable.StorageLayout.upperPackedSymmetricMatrix.ordinal() || - layout.ordinal() == NumericTable.StorageLayout.upperPackedTriangularMatrix.ordinal(); + FloatBuffer result = FloatBuffer.allocate((int) (nNtCols * nRows)); + try { + result = nt.getBlockOfRows(0, nRows, result); + } catch 
(IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; } - - public static boolean isLower(NumericTable.StorageLayout layout) - { - return layout.ordinal() == NumericTable.StorageLayout.lowerPackedSymmetricMatrix.ordinal() || - layout.ordinal() == NumericTable.StorageLayout.lowerPackedTriangularMatrix.ordinal(); + if (nPrintedCols > 0) { + nCols = Math.min(nNtCols, nPrintedCols); } - public static void printNumericTable(String header, NumericTable nt, long nPrintedRows, long nPrintedCols) { - long nNtCols = nt.getNumberOfColumns(); - long nNtRows = nt.getNumberOfRows(); - long nRows = nNtRows; - long nCols = nNtCols; - - NumericTable.StorageLayout layout = nt.getDataLayout(); - - if (nPrintedRows > 0) { - nRows = Math.min(nNtRows, nPrintedRows); - } + StringBuilder builder = new StringBuilder(); + builder.append(header); + builder.append("\n"); - FloatBuffer result = FloatBuffer.allocate((int) (nNtCols * nRows)); - try { - result = nt.getBlockOfRows(0, nRows, result); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; + if (isLower(layout)) { + for (long i = 0; i < nRows; i++) { + for (long j = 0; j <= i; j++) { + String tmp = String.format("%-6.3f ", result.get((int) (i * nNtCols + j))); + builder.append(tmp); } - if (nPrintedCols > 0) { - nCols = Math.min(nNtCols, nPrintedCols); - } - - StringBuilder builder = new StringBuilder(); - builder.append(header); builder.append("\n"); + } + } else if (isUpper(layout)) { - if( isLower(layout) ) - { - for (long i = 0; i < nRows; i++) { - for (long j = 0; j <= i; j++) { - String tmp = String.format("%-6.3f ", result.get((int) (i * nNtCols + j))); - builder.append(tmp); - } - builder.append("\n"); - } - } - else if( isUpper(layout) ) - { - - for (long i = 0; i < nRows; i++) { - for(int k=0; k < i; k++) - builder.append(" "); - for (long j = i; j < nCols; j++) { - String tmp = String.format("%-6.3f ", result.get((int) (i * nNtCols + j))); - builder.append(tmp); - } - builder.append("\n"); - } - - } - else if( isLower(layout) != true && isUpper(layout) != true) - { - for (long i = 0; i < nRows; i++) { - for (long j = 0; j < nCols; j++) { - String tmp = String.format("%-6.3f ", result.get((int) (i * nNtCols + j))); - builder.append(tmp); - } - builder.append("\n"); - } - } - System.out.println(builder.toString()); - } - - public static void printNumericTable(String header, CSRNumericTable nt, long nPrintedRows, long nPrintedCols) { - long[] rowOffsets = nt.getRowOffsetsArray(); - long[] colIndices = nt.getColIndicesArray(); - float[] values = nt.getFloatArray(); - - long nNtCols = nt.getNumberOfColumns(); - long nNtRows = nt.getNumberOfRows(); - long nRows = nNtRows; - long nCols = nNtCols; - - if (nPrintedRows > 0) { - nRows = Math.min(nNtRows, nPrintedRows); + for (long i = 0; i < nRows; i++) { + for (int k = 0; k < i; k++) { + builder.append(" "); } - - if (nPrintedCols > 0) { - nCols = Math.min(nNtCols, nPrintedCols); + for (long j = i; j < nCols; j++) { + String tmp = String.format("%-6.3f ", result.get((int) (i * nNtCols + j))); + builder.append(tmp); } - - StringBuilder builder = new StringBuilder(); - builder.append(header); builder.append("\n"); + } - float[] oneDenseRow = new float[(int) nCols]; - for (int i = 0; i < nRows; i++) { - for (int j = 0; j < nCols; j++) { - oneDenseRow[j] = 0; - } - int nElementsInRow = (int) (rowOffsets[i + 1] - rowOffsets[i]); - for (int k = 0; k < nElementsInRow; k++) { - oneDenseRow[(int) (colIndices[(int) (rowOffsets[i] - 1 + k)] - 1)] = values[(int) 
(rowOffsets[i] - 1 - + k)]; - } - for (int j = 0; j < nCols; j++) { - String tmp = String.format("%-6.3f ", oneDenseRow[j]); - builder.append(tmp); - } - builder.append("\n"); + } else if (isLower(layout) != true && isUpper(layout) != true) { + for (long i = 0; i < nRows; i++) { + for (long j = 0; j < nCols; j++) { + String tmp = String.format("%-6.3f ", result.get((int) (i * nNtCols + j))); + builder.append(tmp); } - System.out.println(builder.toString()); + builder.append("\n"); + } } - - public static void printNumericTable(String header, NumericTable nt, long nRows) { - printNumericTable(header, nt, nRows, nt.getNumberOfColumns()); + System.out.println(builder.toString()); + } + + public static void printNumericTable(String header, CSRNumericTable nt, + long nPrintedRows, long nPrintedCols) { + long[] rowOffsets = nt.getRowOffsetsArray(); + long[] colIndices = nt.getColIndicesArray(); + float[] values = nt.getFloatArray(); + + long nNtCols = nt.getNumberOfColumns(); + long nNtRows = nt.getNumberOfRows(); + long nRows = nNtRows; + long nCols = nNtCols; + + if (nPrintedRows > 0) { + nRows = Math.min(nNtRows, nPrintedRows); } - public static void printNumericTable(String header, NumericTable nt) { - printNumericTable(header, nt, nt.getNumberOfRows()); + if (nPrintedCols > 0) { + nCols = Math.min(nNtCols, nPrintedCols); } - public static void printNumericTable(String header, CSRNumericTable nt, long nRows) { - printNumericTable(header, nt, nRows, nt.getNumberOfColumns()); + StringBuilder builder = new StringBuilder(); + builder.append(header); + builder.append("\n"); + + float[] oneDenseRow = new float[(int) nCols]; + for (int i = 0; i < nRows; i++) { + for (int j = 0; j < nCols; j++) { + oneDenseRow[j] = 0; + } + int nElementsInRow = (int) (rowOffsets[i + 1] - rowOffsets[i]); + for (int k = 0; k < nElementsInRow; k++) { + oneDenseRow[(int) (colIndices[(int) (rowOffsets[i] - 1 + k)] - 1)] + = values[(int) (rowOffsets[i] - 1 + k)]; + } + for (int j = 0; j < nCols; j++) { + String tmp = String.format("%-6.3f ", oneDenseRow[j]); + builder.append(tmp); + } + builder.append("\n"); } - - public static void printNumericTable(String header, CSRNumericTable nt) { - printNumericTable(header, nt, nt.getNumberOfRows()); + System.out.println(builder.toString()); + } + + public static void printNumericTable(String header, NumericTable nt, long nRows) { + printNumericTable(header, nt, nRows, nt.getNumberOfColumns()); + } + + public static void printNumericTable(String header, NumericTable nt) { + printNumericTable(header, nt, nt.getNumberOfRows()); + } + + public static void printNumericTable(String header, CSRNumericTable nt, long nRows) { + printNumericTable(header, nt, nRows, nt.getNumberOfColumns()); + } + + public static void printNumericTable(String header, CSRNumericTable nt) { + printNumericTable(header, nt, nt.getNumberOfRows()); + } + + public static void printNumericTables(NumericTable dataTable1, NumericTable dataTable2, + String title1, String title2, + String message, long nPrintedRows) { + long nRows1 = dataTable1.getNumberOfRows(); + long nRows2 = dataTable2.getNumberOfRows(); + long nCols1 = dataTable1.getNumberOfColumns(); + long nCols2 = dataTable2.getNumberOfColumns(); + + long nRows = Math.min(nRows1, nRows2); + if (nPrintedRows > 0) { + nRows = Math.min(Math.min(nRows1, nRows2), nPrintedRows); } - public static void printNumericTables(NumericTable dataTable1, NumericTable dataTable2,String title1, String title2 , - String message, long nPrintedRows) - { - long nRows1 = 
dataTable1.getNumberOfRows(); - long nRows2 = dataTable2.getNumberOfRows(); - long nCols1 = dataTable1.getNumberOfColumns(); - long nCols2 = dataTable2.getNumberOfColumns(); - - long nRows = Math.min(nRows1, nRows2); - if (nPrintedRows > 0) - { - nRows = Math.min(Math.min(nRows1, nRows2), nPrintedRows); - } - - FloatBuffer result1 = FloatBuffer.allocate((int) (nCols1 * nRows)); - FloatBuffer result2 = FloatBuffer.allocate((int) (nCols2 * nRows)); - try { - result1 = dataTable1.getBlockOfRows(0, nRows, result1); - result2 = dataTable2.getBlockOfRows(0, nRows, result2); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; - } - StringBuilder builder = new StringBuilder(); - builder.append(message); - builder.append("\n"); - builder.append(title1); - - StringBuilder builderHelp = new StringBuilder(); - for (long j = 0; j < nCols1; j++) { - String tmp = String.format("%-6.3f ", result1.get((int) (0 * nCols1 + j))); - builderHelp.append(tmp); - } - int interval = builderHelp.length() - title1.length(); - - for(int i=0; i < interval; i++) - { - builder.append(" "); - } - builder.append(" "); - builder.append(title2); - builder.append("\n"); - - for (long i = 0; i < nRows; i++) { - for (long j = 0; j < nCols1; j++) { - String tmp = String.format("%-6.3f ", result1.get((int) (i * nCols1 + j))); - builder.append(tmp); - } - builder.append(" "); - for (long j = 0; j < nCols2; j++) { - String tmp = String.format("%-6.3f ", result2.get((int) (i * nCols2 + j))); - builder.append(tmp); - } - builder.append("\n"); - } - System.out.println(builder.toString()); - } - - public static void printAprioriItemsets(HomogenNumericTable largeItemsetsTable, - HomogenNumericTable largeItemsetsSupportTable) { - /* Get sizes of tables to store large item sets */ - int nItemsInLargeItemsets = (int) largeItemsetsTable.getNumberOfRows(); - int largeItemsetCount = (int) largeItemsetsSupportTable.getNumberOfRows(); - int nItemsetToPrint = 20; - - /* Get item sets and their support values */ - IntBuffer bufLargeItemsets = IntBuffer - .allocate(nItemsInLargeItemsets * (int) largeItemsetsTable.getNumberOfColumns()); - try { - bufLargeItemsets = largeItemsetsTable.getBlockOfRows(0, nItemsInLargeItemsets, bufLargeItemsets); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; - } - int[] largeItemsets = new int[bufLargeItemsets.capacity()]; - bufLargeItemsets.get(largeItemsets); - - IntBuffer bufLargeItemsetsSupportData = IntBuffer - .allocate(largeItemsetCount * (int) largeItemsetsSupportTable.getNumberOfColumns()); - try { - bufLargeItemsetsSupportData = largeItemsetsSupportTable.getBlockOfRows(0, largeItemsetCount, - bufLargeItemsetsSupportData); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; - } - int[] largeItemsetsSupportData = new int[bufLargeItemsetsSupportData.capacity()]; - bufLargeItemsetsSupportData.get(largeItemsetsSupportData); - - ArrayList> largeItemsetsVector = new ArrayList>(largeItemsetCount); - - for (int i = 0; i < largeItemsetCount; i++) { - largeItemsetsVector.add(new ArrayList()); - } + FloatBuffer result1 = FloatBuffer.allocate((int) (nCols1 * nRows)); + FloatBuffer result2 = FloatBuffer.allocate((int) (nCols2 * nRows)); + try { + result1 = dataTable1.getBlockOfRows(0, nRows, result1); + result2 = dataTable2.getBlockOfRows(0, nRows, result2); + } catch (IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; + } + StringBuilder builder = new StringBuilder(); + builder.append(message); + 
builder.append("\n"); + builder.append(title1); + + StringBuilder builderHelp = new StringBuilder(); + for (long j = 0; j < nCols1; j++) { + String tmp = String.format("%-6.3f ", result1.get((int) (0 * nCols1 + j))); + builderHelp.append(tmp); + } + int interval = builderHelp.length() - title1.length(); - for (int i = 0; i < nItemsInLargeItemsets; i++) { - largeItemsetsVector.get(largeItemsets[2 * i]).add(largeItemsets[2 * i + 1]); - } + for (int i = 0; i < interval; i++) { + builder.append(" "); + } + builder.append(" "); + builder.append(title2); + builder.append("\n"); + + for (long i = 0; i < nRows; i++) { + for (long j = 0; j < nCols1; j++) { + String tmp = String.format("%-6.3f ", result1.get((int) (i * nCols1 + j))); + builder.append(tmp); + } + builder.append(" "); + for (long j = 0; j < nCols2; j++) { + String tmp = String.format("%-6.3f ", result2.get((int) (i * nCols2 + j))); + builder.append(tmp); + } + builder.append("\n"); + } + System.out.println(builder.toString()); + } + + public static void printAprioriItemsets(HomogenNumericTable largeItemsetsTable, + HomogenNumericTable largeItemsetsSupportTable) { + /* Get sizes of tables to store large item sets */ + int nItemsInLargeItemsets = (int) largeItemsetsTable.getNumberOfRows(); + int largeItemsetCount = (int) largeItemsetsSupportTable.getNumberOfRows(); + int nItemsetToPrint = 20; + + /* Get item sets and their support values */ + IntBuffer bufLargeItemsets = IntBuffer + .allocate(nItemsInLargeItemsets * (int) largeItemsetsTable.getNumberOfColumns()); + try { + bufLargeItemsets = largeItemsetsTable.getBlockOfRows(0, nItemsInLargeItemsets, + bufLargeItemsets); + } catch (IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; + } + int[] largeItemsets = new int[bufLargeItemsets.capacity()]; + bufLargeItemsets.get(largeItemsets); + + IntBuffer bufLargeItemsetsSupportData = IntBuffer + .allocate(largeItemsetCount * (int) largeItemsetsSupportTable.getNumberOfColumns()); + try { + bufLargeItemsetsSupportData = largeItemsetsSupportTable.getBlockOfRows(0, largeItemsetCount, + bufLargeItemsetsSupportData); + } catch (IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; + } + int[] largeItemsetsSupportData = new int[bufLargeItemsetsSupportData.capacity()]; + bufLargeItemsetsSupportData.get(largeItemsetsSupportData); - ArrayList supportVector = new ArrayList(largeItemsetCount); - for (int i = 0; i < largeItemsetCount; i++) { - supportVector.add(0); - } + ArrayList> largeItemsetsVector + = new ArrayList>(largeItemsetCount); - for (int i = 0; i < largeItemsetCount; i++) { - int index = largeItemsetsSupportData[2 * i]; - supportVector.set(index, largeItemsetsSupportData[2 * i + 1]); - } + for (int i = 0; i < largeItemsetCount; i++) { + largeItemsetsVector.add(new ArrayList()); + } - System.out.println("\nApriori example program results"); - System.out.println("\nLast " + nItemsetToPrint + " large itemsets: "); - System.out.println("\nItemset\t\t\tSupport"); + for (int i = 0; i < nItemsInLargeItemsets; i++) { + largeItemsetsVector.get(largeItemsets[2 * i]).add(largeItemsets[2 * i + 1]); + } - int iMin = ((largeItemsetCount > nItemsetToPrint) ? 
largeItemsetCount - nItemsetToPrint : 0); - for (int i = iMin; i < largeItemsetCount; i++) { - System.out.print("{"); - for (int l = 0; l < largeItemsetsVector.get(i).size() - 1; l++) { - System.out.print(largeItemsetsVector.get(i).get(l) + ", "); - } - System.out.print(largeItemsetsVector.get(i).get(largeItemsetsVector.get(i).size() - 1) + "}\t\t"); + ArrayList supportVector = new ArrayList(largeItemsetCount); + for (int i = 0; i < largeItemsetCount; i++) { + supportVector.add(0); + } - System.out.println(supportVector.get(i)); - } + for (int i = 0; i < largeItemsetCount; i++) { + int index = largeItemsetsSupportData[2 * i]; + supportVector.set(index, largeItemsetsSupportData[2 * i + 1]); } - public static void printAprioriRules(HomogenNumericTable leftItemsTable, HomogenNumericTable rightItemsTable, - HomogenNumericTable confidenceTable) { - int nRulesToPrint = 20; - /* Get sizes of tables to store association rules */ - int nLeftItems = (int) leftItemsTable.getNumberOfRows(); - int nRightItems = (int) rightItemsTable.getNumberOfRows(); - int nRules = (int) confidenceTable.getNumberOfRows(); + System.out.println("\nApriori example program results"); + System.out.println("\nLast " + nItemsetToPrint + " large itemsets: "); + System.out.println("\nItemset\t\t\tSupport"); - /* Get association rules data */ + int iMin = ((largeItemsetCount > nItemsetToPrint) ? largeItemsetCount - nItemsetToPrint : 0); + for (int i = iMin; i < largeItemsetCount; i++) { + System.out.print("{"); + for (int l = 0; l < largeItemsetsVector.get(i).size() - 1; l++) { + System.out.print(largeItemsetsVector.get(i).get(l) + ", "); + } + System.out.print(largeItemsetsVector.get(i).get( + largeItemsetsVector.get(i).size() - 1) + "}\t\t"); - IntBuffer bufLeftItems = IntBuffer.allocate(nLeftItems * (int) leftItemsTable.getNumberOfColumns()); - try { - bufLeftItems = leftItemsTable.getBlockOfRows(0, nLeftItems, bufLeftItems); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; - } - int[] leftItems = new int[bufLeftItems.capacity()]; - bufLeftItems.get(leftItems); - - IntBuffer bufRightItems = IntBuffer.allocate(nRightItems * (int) rightItemsTable.getNumberOfColumns()); - try { - bufRightItems = rightItemsTable.getBlockOfRows(0, nRightItems, bufRightItems); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; - } - int[] rightItems = new int[bufRightItems.capacity()]; - bufRightItems.get(rightItems); - - FloatBuffer bufConfidence = FloatBuffer.allocate(nRules * (int) confidenceTable.getNumberOfColumns()); - try { - bufConfidence = confidenceTable.getBlockOfRows(0, nRules, bufConfidence); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; - } - float[] confidence = new float[bufConfidence.capacity()]; - bufConfidence.get(confidence); + System.out.println(supportVector.get(i)); + } + } + + public static void printAprioriRules(HomogenNumericTable leftItemsTable, + HomogenNumericTable rightItemsTable, + HomogenNumericTable confidenceTable) { + int nRulesToPrint = 20; + /* Get sizes of tables to store association rules */ + int nLeftItems = (int) leftItemsTable.getNumberOfRows(); + int nRightItems = (int) rightItemsTable.getNumberOfRows(); + int nRules = (int) confidenceTable.getNumberOfRows(); + + /* Get association rules data */ + + IntBuffer bufLeftItems = IntBuffer.allocate( + nLeftItems * (int) leftItemsTable.getNumberOfColumns()); + try { + bufLeftItems = leftItemsTable.getBlockOfRows(0, nLeftItems, bufLeftItems); + } catch 
(IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; + } + int[] leftItems = new int[bufLeftItems.capacity()]; + bufLeftItems.get(leftItems); + + IntBuffer bufRightItems = IntBuffer.allocate( + nRightItems * (int) rightItemsTable.getNumberOfColumns()); + try { + bufRightItems = rightItemsTable.getBlockOfRows(0, nRightItems, bufRightItems); + } catch (IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; + } + int[] rightItems = new int[bufRightItems.capacity()]; + bufRightItems.get(rightItems); + + FloatBuffer bufConfidence = FloatBuffer.allocate( + nRules * (int) confidenceTable.getNumberOfColumns()); + try { + bufConfidence = confidenceTable.getBlockOfRows(0, nRules, bufConfidence); + } catch (IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; + } + float[] confidence = new float[bufConfidence.capacity()]; + bufConfidence.get(confidence); - ArrayList> leftItemsVector = new ArrayList>(nRules); - for (int i = 0; i < nRules; i++) { - leftItemsVector.add(new ArrayList()); - } + ArrayList> leftItemsVector = new ArrayList>(nRules); + for (int i = 0; i < nRules; i++) { + leftItemsVector.add(new ArrayList()); + } - if (nRules == 0) { - System.out.println("No association rules were found "); - return; - } + if (nRules == 0) { + System.out.println("No association rules were found "); + return; + } - for (int i = 0; i < nLeftItems; i++) { - leftItemsVector.get((leftItems[2 * i])).add(leftItems[2 * i + 1]); - } + for (int i = 0; i < nLeftItems; i++) { + leftItemsVector.get((leftItems[2 * i])).add(leftItems[2 * i + 1]); + } - ArrayList> rightItemsVector = new ArrayList>(nRules); - for (int i = 0; i < nRules; i++) { - rightItemsVector.add(new ArrayList()); - } + ArrayList> rightItemsVector = new ArrayList>(nRules); + for (int i = 0; i < nRules; i++) { + rightItemsVector.add(new ArrayList()); + } - for (int i = 0; i < nRightItems; i++) { - rightItemsVector.get((rightItems[2 * i])).add(rightItems[2 * i + 1]); - } + for (int i = 0; i < nRightItems; i++) { + rightItemsVector.get((rightItems[2 * i])).add(rightItems[2 * i + 1]); + } - ArrayList confidenceVector = new ArrayList(nRules); - for (int i = 0; i < nRules; i++) { - confidenceVector.add(confidence[i]); - } + ArrayList confidenceVector = new ArrayList(nRules); + for (int i = 0; i < nRules; i++) { + confidenceVector.add(confidence[i]); + } - System.out.println("\nLast " + nRulesToPrint + " association rules: "); - System.out.println("\nRule" + "\t\t\t\tConfidence"); + System.out.println("\nLast " + nRulesToPrint + " association rules: "); + System.out.println("\nRule" + "\t\t\t\tConfidence"); - int iMin = ((nRules > nRulesToPrint) ? (nRules - nRulesToPrint) : 0); - for (int i = iMin; i < nRules; i++) { - System.out.print("{"); - for (int l = 0; l < leftItemsVector.get(i).size() - 1; l++) { - System.out.print(leftItemsVector.get(i).get(l) + ", "); - } - System.out.print(leftItemsVector.get(i).get(leftItemsVector.get(i).size() - 1) + "} => {"); + int iMin = ((nRules > nRulesToPrint) ? 
(nRules - nRulesToPrint) : 0); + for (int i = iMin; i < nRules; i++) { + System.out.print("{"); + for (int l = 0; l < leftItemsVector.get(i).size() - 1; l++) { + System.out.print(leftItemsVector.get(i).get(l) + ", "); + } + System.out.print(leftItemsVector.get(i).get(leftItemsVector.get(i).size() - 1) + "} => {"); - for (int l = 0; l < rightItemsVector.get(i).size() - 1; l++) { - System.out.print(rightItemsVector.get(i).get(l) + ", "); - } - System.out.print(rightItemsVector.get(i).get(rightItemsVector.get(i).size() - 1) + "}\t\t"); + for (int l = 0; l < rightItemsVector.get(i).size() - 1; l++) { + System.out.print(rightItemsVector.get(i).get(l) + ", "); + } + System.out.print(rightItemsVector.get(i).get(rightItemsVector.get(i).size() - 1) + "}\t\t"); - System.out.println(confidenceVector.get(i)); - } + System.out.println(confidenceVector.get(i)); } - - public static void printALSRatings(NumericTable usersOffsetTable, NumericTable itemsOffsetTable, - NumericTable ratings) { - long nUsers = ratings.getNumberOfRows(); - long nItems = ratings.getNumberOfColumns(); - - float[] ratingsData = ((HomogenNumericTable)ratings).getFloatArray(); - IntBuffer usersOffsetBuf = IntBuffer.allocate(1); - IntBuffer itemsOffsetBuf = IntBuffer.allocate(1); - try { - usersOffsetBuf = usersOffsetTable.getBlockOfRows(0, 1, usersOffsetBuf); - itemsOffsetBuf = itemsOffsetTable.getBlockOfRows(0, 1, itemsOffsetBuf); - } catch (IllegalAccessException e) { - ErrorHandling.printThrowable(e); - return; - } - int[] usersOffsetData = new int[1]; - int[] itemsOffsetData = new int[1]; - usersOffsetBuf.get(usersOffsetData); - itemsOffsetBuf.get(itemsOffsetData); - long usersOffset = (long)usersOffsetData[0]; - long itemsOffset = (long)itemsOffsetData[0]; - - System.out.println(" User ID, Item ID, rating"); - for (long i = 0; i < nUsers; i++) { - for (long j = 0; j < nItems; j++) { - long userId = i + usersOffset; - long itemId = j + itemsOffset; - System.out.println(userId + ", " + itemId + ", " + ratingsData[(int)(i * nItems + j)]); - } - } + } + + public static void printALSRatings(NumericTable usersOffsetTable, NumericTable itemsOffsetTable, + NumericTable ratings) { + long nUsers = ratings.getNumberOfRows(); + long nItems = ratings.getNumberOfColumns(); + + float[] ratingsData = ((HomogenNumericTable) ratings).getFloatArray(); + IntBuffer usersOffsetBuf = IntBuffer.allocate(1); + IntBuffer itemsOffsetBuf = IntBuffer.allocate(1); + try { + usersOffsetBuf = usersOffsetTable.getBlockOfRows(0, 1, usersOffsetBuf); + itemsOffsetBuf = itemsOffsetTable.getBlockOfRows(0, 1, itemsOffsetBuf); + } catch (IllegalAccessException e) { + ErrorHandling.printThrowable(e); + return; + } + int[] usersOffsetData = new int[1]; + int[] itemsOffsetData = new int[1]; + usersOffsetBuf.get(usersOffsetData); + itemsOffsetBuf.get(itemsOffsetData); + long usersOffset = usersOffsetData[0]; + long itemsOffset = itemsOffsetData[0]; + + System.out.println(" User ID, Item ID, rating"); + for (long i = 0; i < nUsers; i++) { + for (long j = 0; j < nItems; j++) { + long userId = i + usersOffset; + long itemId = j + itemsOffset; + System.out.println(userId + ", " + itemId + ", " + ratingsData[(int) (i * nItems + j)]); + } } + } } diff --git a/mllib-dal/src/main/native/.clang-format b/mllib-dal/src/main/native/.clang-format new file mode 100644 index 000000000..0a9ea9178 --- /dev/null +++ b/mllib-dal/src/main/native/.clang-format @@ -0,0 +1,136 @@ +--- +Language: Cpp +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: false 
+AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Right +AlignOperands: true +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + - Regex: '.*' + Priority: 1 + SortPriority: 0 +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentCaseLabels: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentWidth: 4 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false 
+SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: Latest +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 4 +UseCRLF: false +UseTab: Never +... + diff --git a/mllib-dal/src/main/native/ALSDALImpl.cpp b/mllib-dal/src/main/native/ALSDALImpl.cpp index 29162fddd..157b39bae 100644 --- a/mllib-dal/src/main/native/ALSDALImpl.cpp +++ b/mllib-dal/src/main/native/ALSDALImpl.cpp @@ -1,13 +1,30 @@ +/******************************************************************************* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + #include -#include #include +#include #include #include "OneCCL.h" -#include "ALSShuffle.h" #include "org_apache_spark_ml_recommendation_ALSDALImpl.h" #include "service.h" +#include "ALSShuffle.h" + using namespace std; using namespace daal; using namespace daal::algorithms; @@ -20,9 +37,6 @@ typedef float algorithmFPType; /* Algorithm floating-point type */ NumericTablePtr userOffset; NumericTablePtr itemOffset; -// KeyValueDataCollectionPtr userOffsetsOnMaster; -// KeyValueDataCollectionPtr itemOffsetsOnMaster; - CSRNumericTablePtr dataTable; CSRNumericTablePtr transposedDataTable; @@ -31,492 +45,406 @@ KeyValueDataCollectionPtr itemStep3LocalInput; training::DistributedPartialResultStep4Ptr itemsPartialResultLocal; training::DistributedPartialResultStep4Ptr usersPartialResultLocal; -std::vector itemsPartialResultsMaster; -std::vector usersPartialResultsMaster; +std::vector + itemsPartialResultsMaster; +std::vector + usersPartialResultsMaster; + +template +void gather(size_t rankId, ccl::communicator &comm, size_t nBlocks, + const ByteBuffer &nodeResults, T *result) { + vector perNodeArchLengthMaster(nBlocks); + size_t perNodeArchLength = nodeResults.size(); + ByteBuffer serializedData; + + vector recv_counts(nBlocks); + for (size_t i = 0; i < nBlocks; i++) + recv_counts[i] = sizeof(size_t); + + ccl::allgatherv(&perNodeArchLength, sizeof(size_t), + perNodeArchLengthMaster.data(), recv_counts, + ccl::datatype::uint8, comm) + .wait(); + + // should resize for all ranks for ccl_allgatherv + size_t memoryBuf = 0; + for (size_t i = 0; i < nBlocks; i++) { + memoryBuf += perNodeArchLengthMaster[i]; + } + serializedData.resize(memoryBuf); + + std::vector displs(nBlocks); + if (rankId == ccl_root) { + size_t shift = 0; + for (size_t i = 0; i < nBlocks; i++) { + displs[i] = shift; + shift += perNodeArchLengthMaster[i]; + } + } + + /* Transfer partial results to step 2 on the root node */ + ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], + perNodeArchLengthMaster, ccl::datatype::uint8, comm) + .wait(); + + if (rankId == ccl_root) { + for (size_t i = 0; i < nBlocks; i++) { + /* Deserialize partial results from step 1 */ + result[i] = result[i]->cast(deserializeDAALObject( + &serializedData[0] + displs[i], perNodeArchLengthMaster[i])); + } + } +} template -void gather(size_t rankId, 
ccl::communicator &comm, size_t nBlocks, const ByteBuffer& nodeResults, T* result) { - vector perNodeArchLengthMaster(nBlocks); - size_t perNodeArchLength = nodeResults.size(); - ByteBuffer serializedData; - - vector recv_counts(nBlocks); - for (size_t i = 0; i < nBlocks; i++) recv_counts[i] = sizeof(size_t); - - // MPI_Gather(&perNodeArchLength, sizeof(int), MPI_CHAR, perNodeArchLengthMaster, - // sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster.data(), recv_counts, - ccl::datatype::uint8, comm).wait(); - - // should resize for all ranks for ccl_allgatherv - size_t memoryBuf = 0; - for (size_t i = 0; i < nBlocks; i++) { - memoryBuf += perNodeArchLengthMaster[i]; - } - serializedData.resize(memoryBuf); - - std::vector displs(nBlocks); - if (rankId == ccl_root) { +void all2all(ccl::communicator &comm, ByteBuffer *nodeResults, size_t nBlocks, + KeyValueDataCollectionPtr result) { + size_t memoryBuf = 0; size_t shift = 0; + vector perNodeArchLengths(nBlocks); + vector perNodeArchLengthsRecv(nBlocks); + std::vector sdispls(nBlocks); + ByteBuffer serializedSendData; + ByteBuffer serializedRecvData; + + for (size_t i = 0; i < nBlocks; i++) { + perNodeArchLengths[i] = nodeResults[i].size(); + memoryBuf += perNodeArchLengths[i]; + sdispls[i] = shift; + shift += perNodeArchLengths[i]; + } + serializedSendData.resize(memoryBuf); + + /* memcpy to avoid double compute */ + memoryBuf = 0; for (size_t i = 0; i < nBlocks; i++) { - displs[i] = shift; - shift += perNodeArchLengthMaster[i]; + for (size_t j = 0; j < perNodeArchLengths[i]; j++) + serializedSendData[memoryBuf + j] = nodeResults[i][j]; + memoryBuf += perNodeArchLengths[i]; } - } - /* Transfer partial results to step 2 on the root node */ - // MPI_Gatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], - // perNodeArchLengthMaster, displs, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], - perNodeArchLengthMaster, ccl::datatype::uint8, comm).wait(); + ccl::alltoall(perNodeArchLengths.data(), perNodeArchLengthsRecv.data(), + sizeof(size_t), ccl::datatype::uint8, comm) + .wait(); - if (rankId == ccl_root) { + memoryBuf = 0; + shift = 0; + std::vector rdispls(nBlocks); for (size_t i = 0; i < nBlocks; i++) { - /* Deserialize partial results from step 1 */ - result[i] = result[i]->cast(deserializeDAALObject(&serializedData[0] + displs[i], - perNodeArchLengthMaster[i])); + memoryBuf += perNodeArchLengthsRecv[i]; + rdispls[i] = shift; + shift += perNodeArchLengthsRecv[i]; } - } -} -// void gatherUsers(const ByteBuffer & nodeResults, int nBlocks) -// { -// size_t perNodeArchLengthMaster[nBlocks]; -// size_t perNodeArchLength = nodeResults.size(); -// ByteBuffer serializedData; -// size_t recv_counts[nBlocks]; -// for (int i = 0; i < nBlocks; i++) { -// recv_counts[i] = sizeof(size_t); -// } - -// ccl_request_t request; -// // MPI_Allgather(&perNodeArchLength, sizeof(int), MPI_CHAR, -// perNodeArchLengthMaster, sizeof(int), MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster, -// recv_counts, ccl_dtype_char, NULL, NULL, NULL, &request); ccl_wait(request); - -// size_t memoryBuf = 0; -// for (int i = 0; i < nBlocks; i++) -// { -// memoryBuf += perNodeArchLengthMaster[i]; -// } -// serializedData.resize(memoryBuf); - -// size_t shift = 0; -// std::vector displs(nBlocks); -// for (int i = 0; i < nBlocks; i++) -// { -// displs[i] = shift; -// 
shift += perNodeArchLengthMaster[i]; -// } - -// /* Transfer partial results to step 2 on the root node */ -// // MPI_Allgatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], -// perNodeArchLengthMaster, displs, MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], -// perNodeArchLengthMaster, ccl_dtype_char, NULL, NULL, NULL, &request); -// ccl_wait(request); - -// usersPartialResultsMaster.resize(nBlocks); -// for (int i = 0; i < nBlocks; i++) -// { -// /* Deserialize partial results from step 4 */ -// usersPartialResultsMaster[i] = -// training::DistributedPartialResultStep4::cast(deserializeDAALObject(&serializedData[0] -// + displs[i], perNodeArchLengthMaster[i])); -// } -// } - -// void gatherItems(const ByteBuffer & nodeResults, size_t nBlocks) -// { -// size_t perNodeArchLengthMaster[nBlocks]; -// size_t perNodeArchLength = nodeResults.size(); -// ByteBuffer serializedData; -// size_t recv_counts[nBlocks]; -// for (size_t i = 0; i < nBlocks; i++) { -// recv_counts[i] = sizeof(size_t); -// } - -// ccl_request_t request; -// // MPI_Allgather(&perNodeArchLength, sizeof(int), MPI_CHAR, -// perNodeArchLengthMaster, sizeof(int), MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster, -// recv_counts, ccl_dtype_char, NULL, NULL, NULL, &request); ccl_wait(request); - -// size_t memoryBuf = 0; -// for (size_t i = 0; i < nBlocks; i++) -// { -// memoryBuf += perNodeArchLengthMaster[i]; -// } -// serializedData.resize(memoryBuf); - -// size_t shift = 0; -// std::vector displs(nBlocks); -// for (size_t i = 0; i < nBlocks; i++) -// { -// displs[i] = shift; -// shift += perNodeArchLengthMaster[i]; -// } - -// /* Transfer partial results to step 2 on the root node */ -// // MPI_Allgatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], -// perNodeArchLengthMaster, displs, MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], -// perNodeArchLengthMaster, ccl_dtype_char, NULL, NULL, NULL, &request); -// ccl_wait(request); - -// itemsPartialResultsMaster.resize(nBlocks); -// for (size_t i = 0; i < nBlocks; i++) -// { -// /* Deserialize partial results from step 4 */ -// itemsPartialResultsMaster[i] = -// training::DistributedPartialResultStep4::cast(deserializeDAALObject(&serializedData[0] -// + displs[i], perNodeArchLengthMaster[i])); -// } -// } + serializedRecvData.resize(memoryBuf); -template -void all2all(ccl::communicator &comm, ByteBuffer* nodeResults, size_t nBlocks, KeyValueDataCollectionPtr result) { - size_t memoryBuf = 0; - size_t shift = 0; - vector perNodeArchLengths(nBlocks); - vector perNodeArchLengthsRecv(nBlocks); - std::vector sdispls(nBlocks); - ByteBuffer serializedSendData; - ByteBuffer serializedRecvData; - - for (size_t i = 0; i < nBlocks; i++) { - perNodeArchLengths[i] = nodeResults[i].size(); - memoryBuf += perNodeArchLengths[i]; - sdispls[i] = shift; - shift += perNodeArchLengths[i]; - } - serializedSendData.resize(memoryBuf); - - /* memcpy to avoid double compute */ - memoryBuf = 0; - for (size_t i = 0; i < nBlocks; i++) { - for (size_t j = 0; j < perNodeArchLengths[i]; j++) - serializedSendData[memoryBuf + j] = nodeResults[i][j]; - memoryBuf += perNodeArchLengths[i]; - } - - // MPI_Alltoall(perNodeArchLengths, sizeof(int), MPI_CHAR, perNodeArchLengthsRecv, - // sizeof(int), MPI_CHAR, MPI_COMM_WORLD); - ccl::alltoall(perNodeArchLengths.data(), perNodeArchLengthsRecv.data(), sizeof(size_t), 
ccl::datatype::uint8, comm).wait(); - - memoryBuf = 0; - shift = 0; - std::vector rdispls(nBlocks); - for (size_t i = 0; i < nBlocks; i++) { - memoryBuf += perNodeArchLengthsRecv[i]; - rdispls[i] = shift; - shift += perNodeArchLengthsRecv[i]; - } - - serializedRecvData.resize(memoryBuf); - - /* Transfer partial results to step 2 on the root node */ - // MPI_Alltoallv(&serializedSendData[0], perNodeArchLengths, sdispls, MPI_CHAR, - // &serializedRecvData[0], perNodeArchLengthsRecv, rdispls, MPI_CHAR, - // MPI_COMM_WORLD); - ccl::alltoallv(&serializedSendData[0], perNodeArchLengths, &serializedRecvData[0], - perNodeArchLengthsRecv, ccl::datatype::uint8, comm).wait(); - - for (size_t i = 0; i < nBlocks; i++) { - (*result)[i] = T::cast(deserializeDAALObject(&serializedRecvData[rdispls[i]], - perNodeArchLengthsRecv[i])); - } + /* Transfer partial results to step 2 on the root node */ + ccl::alltoallv(&serializedSendData[0], perNodeArchLengths, + &serializedRecvData[0], perNodeArchLengthsRecv, + ccl::datatype::uint8, comm) + .wait(); + + for (size_t i = 0; i < nBlocks; i++) { + (*result)[i] = T::cast(deserializeDAALObject( + &serializedRecvData[rdispls[i]], perNodeArchLengthsRecv[i])); + } } -KeyValueDataCollectionPtr initializeStep1Local(size_t rankId, size_t partitionId, +KeyValueDataCollectionPtr initializeStep1Local(size_t rankId, + size_t partitionId, size_t nBlocks, size_t nUsers, size_t nFactors) { - int usersPartition[1] = {(int)nBlocks}; - - /* Create an algorithm object to initialize the implicit ALS model with the default - * method */ - training::init::Distributed - initAlgorithm; - initAlgorithm.parameter.fullNUsers = nUsers; - initAlgorithm.parameter.nFactors = nFactors; - initAlgorithm.parameter.seed += rankId; - initAlgorithm.parameter.partition.reset( - new HomogenNumericTable((int*)usersPartition, 1, 1)); - /* Pass a training data set and dependent values to the algorithm */ - initAlgorithm.input.set(training::init::data, dataTable); - - /* Initialize the implicit ALS model */ - initAlgorithm.compute(); - - training::init::PartialResultPtr partialResult = initAlgorithm.getPartialResult(); - itemStep3LocalInput = partialResult->get(training::init::outputOfInitForComputeStep3); - userOffset = partialResult->get(training::init::offsets, (size_t)rankId); - // if (rankId == ccl_root) - // { - // userOffsetsOnMaster = partialResult->get(training::init::offsets); - // } - PartialModelPtr partialModelLocal = partialResult->get(training::init::partialModel); - - itemsPartialResultLocal.reset(new training::DistributedPartialResultStep4()); - itemsPartialResultLocal->set(training::outputOfStep4ForStep1, partialModelLocal); - - return partialResult->get(training::init::outputOfStep1ForStep2); + int usersPartition[1] = {(int)nBlocks}; + + /* Create an algorithm object to initialize the implicit ALS model with the + * default method */ + training::init::Distributed + initAlgorithm; + initAlgorithm.parameter.fullNUsers = nUsers; + initAlgorithm.parameter.nFactors = nFactors; + initAlgorithm.parameter.seed += rankId; + initAlgorithm.parameter.partition.reset( + new HomogenNumericTable((int *)usersPartition, 1, 1)); + /* Pass a training data set and dependent values to the algorithm */ + initAlgorithm.input.set(training::init::data, dataTable); + + /* Initialize the implicit ALS model */ + initAlgorithm.compute(); + + training::init::PartialResultPtr partialResult = + initAlgorithm.getPartialResult(); + itemStep3LocalInput = + partialResult->get(training::init::outputOfInitForComputeStep3); + 
userOffset = partialResult->get(training::init::offsets, (size_t)rankId); + + PartialModelPtr partialModelLocal = + partialResult->get(training::init::partialModel); + + itemsPartialResultLocal.reset( + new training::DistributedPartialResultStep4()); + itemsPartialResultLocal->set(training::outputOfStep4ForStep1, + partialModelLocal); + + return partialResult->get(training::init::outputOfStep1ForStep2); } -void initializeStep2Local(size_t rankId, size_t partitionId, - const KeyValueDataCollectionPtr& initStep2LocalInput) { - /* Create an algorithm object to perform the second step of the implicit ALS - * initialization algorithm */ - training::init::Distributed - initAlgorithm; - - initAlgorithm.input.set(training::init::inputOfStep2FromStep1, initStep2LocalInput); - - /* Compute partial results of the second step on local nodes */ - initAlgorithm.compute(); - - training::init::DistributedPartialResultStep2Ptr partialResult = - initAlgorithm.getPartialResult(); - transposedDataTable = - CSRNumericTable::cast(partialResult->get(training::init::transposedData)); - userStep3LocalInput = partialResult->get(training::init::outputOfInitForComputeStep3); - itemOffset = partialResult->get(training::init::offsets, (size_t)rankId); - // if (rankId == ccl_root) - // { - // itemOffsetsOnMaster = partialResult->get(training::init::offsets); - // } +void initializeStep2Local( + size_t rankId, size_t partitionId, + const KeyValueDataCollectionPtr &initStep2LocalInput) { + /* Create an algorithm object to perform the second step of the implicit ALS + * initialization algorithm */ + training::init::Distributed + initAlgorithm; + + initAlgorithm.input.set(training::init::inputOfStep2FromStep1, + initStep2LocalInput); + + /* Compute partial results of the second step on local nodes */ + initAlgorithm.compute(); + + training::init::DistributedPartialResultStep2Ptr partialResult = + initAlgorithm.getPartialResult(); + transposedDataTable = CSRNumericTable::cast( + partialResult->get(training::init::transposedData)); + userStep3LocalInput = + partialResult->get(training::init::outputOfInitForComputeStep3); + itemOffset = partialResult->get(training::init::offsets, (size_t)rankId); } -void initializeModel(size_t rankId, ccl::communicator &comm, size_t partitionId, size_t nBlocks, size_t nUsers, - size_t nFactors) { - std::cout << "ALS (native): initializeModel " << std::endl; +void initializeModel(size_t rankId, ccl::communicator &comm, size_t partitionId, + size_t nBlocks, size_t nUsers, size_t nFactors) { + std::cout << "ALS (native): initializeModel " << std::endl; - auto t1 = std::chrono::high_resolution_clock::now(); + auto t1 = std::chrono::high_resolution_clock::now(); - KeyValueDataCollectionPtr initStep1LocalResult = - initializeStep1Local(rankId, partitionId, nBlocks, nUsers, nFactors); + KeyValueDataCollectionPtr initStep1LocalResult = + initializeStep1Local(rankId, partitionId, nBlocks, nUsers, nFactors); - /* MPI_Alltoallv to populate initStep2LocalInput */ - ByteBuffer nodeCPs[nBlocks]; - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*initStep1LocalResult)[i].get(), nodeCPs[i]); - } - KeyValueDataCollectionPtr initStep2LocalInput(new KeyValueDataCollection()); - all2all(comm, nodeCPs, nBlocks, initStep2LocalInput); + ByteBuffer nodeCPs[nBlocks]; + for (size_t i = 0; i < nBlocks; i++) { + serializeDAALObject((*initStep1LocalResult)[i].get(), nodeCPs[i]); + } + KeyValueDataCollectionPtr initStep2LocalInput(new KeyValueDataCollection()); + all2all(comm, nodeCPs, nBlocks, 
initStep2LocalInput); - initializeStep2Local(rankId, partitionId, initStep2LocalInput); + initializeStep2Local(rankId, partitionId, initStep2LocalInput); - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(t2 - t1).count(); - std::cout << "ALS (native): initializeModel took " << duration << " secs" << std::endl; + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(t2 - t1).count(); + std::cout << "ALS (native): initializeModel took " << duration << " secs" + << std::endl; } training::DistributedPartialResultStep1Ptr computeStep1Local( - const training::DistributedPartialResultStep4Ptr& partialResultLocal, + const training::DistributedPartialResultStep4Ptr &partialResultLocal, size_t nFactors) { - /* Create algorithm objects to compute implicit ALS algorithm in the distributed - * processing mode on the local node using the default method */ - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; + /* Create algorithm objects to compute implicit ALS algorithm in the + * distributed processing mode on the local node using the default method */ + training::Distributed algorithm; + algorithm.parameter.nFactors = nFactors; - /* Set input objects for the algorithm */ - algorithm.input.set(training::partialModel, - partialResultLocal->get(training::outputOfStep4ForStep1)); + /* Set input objects for the algorithm */ + algorithm.input.set( + training::partialModel, + partialResultLocal->get(training::outputOfStep4ForStep1)); - /* Compute partial estimates on local nodes */ - algorithm.compute(); + /* Compute partial estimates on local nodes */ + algorithm.compute(); - /* Get the computed partial estimates */ - return algorithm.getPartialResult(); + /* Get the computed partial estimates */ + return algorithm.getPartialResult(); } NumericTablePtr computeStep2Master( - const training::DistributedPartialResultStep1Ptr* step1LocalResultsOnMaster, + const training::DistributedPartialResultStep1Ptr *step1LocalResultsOnMaster, size_t nFactors, size_t nBlocks) { - /* Create algorithm objects to compute implicit ALS algorithm in the distributed - * processing mode on the master node using the default method */ - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; + /* Create algorithm objects to compute implicit ALS algorithm in the + * distributed processing mode on the master node using the default method + */ + training::Distributed algorithm; + algorithm.parameter.nFactors = nFactors; - /* Set input objects for the algorithm */ - for (size_t i = 0; i < nBlocks; i++) { - algorithm.input.add(training::inputOfStep2FromStep1, step1LocalResultsOnMaster[i]); - } + /* Set input objects for the algorithm */ + for (size_t i = 0; i < nBlocks; i++) { + algorithm.input.add(training::inputOfStep2FromStep1, + step1LocalResultsOnMaster[i]); + } - /* Compute a partial estimate on the master node from the partial estimates on local - * nodes */ - algorithm.compute(); + /* Compute a partial estimate on the master node from the partial estimates + * on local nodes */ + algorithm.compute(); - return algorithm.getPartialResult()->get(training::outputOfStep2ForStep4); + return algorithm.getPartialResult()->get(training::outputOfStep2ForStep4); } KeyValueDataCollectionPtr computeStep3Local( - const NumericTablePtr& offset, - const training::DistributedPartialResultStep4Ptr& partialResultLocal, - const KeyValueDataCollectionPtr& step3LocalInput, size_t nFactors) { - 
training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; + const NumericTablePtr &offset, + const training::DistributedPartialResultStep4Ptr &partialResultLocal, + const KeyValueDataCollectionPtr &step3LocalInput, size_t nFactors) { + training::Distributed algorithm; + algorithm.parameter.nFactors = nFactors; - algorithm.input.set(training::partialModel, - partialResultLocal->get(training::outputOfStep4ForStep3)); - algorithm.input.set(training::inputOfStep3FromInit, step3LocalInput); - algorithm.input.set(training::offset, offset); + algorithm.input.set( + training::partialModel, + partialResultLocal->get(training::outputOfStep4ForStep3)); + algorithm.input.set(training::inputOfStep3FromInit, step3LocalInput); + algorithm.input.set(training::offset, offset); - algorithm.compute(); + algorithm.compute(); - return algorithm.getPartialResult()->get(training::outputOfStep3ForStep4); + return algorithm.getPartialResult()->get(training::outputOfStep3ForStep4); } -training::DistributedPartialResultStep4Ptr computeStep4Local( - const CSRNumericTablePtr& dataTable, const NumericTablePtr& step2MasterResult, - const KeyValueDataCollectionPtr& step4LocalInput, size_t nFactors) { - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; +training::DistributedPartialResultStep4Ptr +computeStep4Local(const CSRNumericTablePtr &dataTable, + const NumericTablePtr &step2MasterResult, + const KeyValueDataCollectionPtr &step4LocalInput, + size_t nFactors) { + training::Distributed algorithm; + algorithm.parameter.nFactors = nFactors; - algorithm.input.set(training::partialModels, step4LocalInput); - algorithm.input.set(training::partialData, dataTable); - algorithm.input.set(training::inputOfStep4FromStep2, step2MasterResult); + algorithm.input.set(training::partialModels, step4LocalInput); + algorithm.input.set(training::partialData, dataTable); + algorithm.input.set(training::inputOfStep4FromStep2, step2MasterResult); - algorithm.compute(); + algorithm.compute(); - return algorithm.getPartialResult(); + return algorithm.getPartialResult(); } -void trainModel(size_t rankId, ccl::communicator &comm, size_t partitionId, size_t nBlocks, size_t nFactors, - size_t maxIterations) { - std::cout << "ALS (native): trainModel" << std::endl; +void trainModel(size_t rankId, ccl::communicator &comm, size_t partitionId, + size_t nBlocks, size_t nFactors, size_t maxIterations) { + std::cout << "ALS (native): trainModel" << std::endl; - auto tStart = std::chrono::high_resolution_clock::now(); + auto tStart = std::chrono::high_resolution_clock::now(); - training::DistributedPartialResultStep1Ptr step1LocalResultsOnMaster[nBlocks]; - training::DistributedPartialResultStep1Ptr step1LocalResult; - NumericTablePtr step2MasterResult; - KeyValueDataCollectionPtr step3LocalResult; - KeyValueDataCollectionPtr step4LocalInput(new KeyValueDataCollection()); + training::DistributedPartialResultStep1Ptr + step1LocalResultsOnMaster[nBlocks]; + training::DistributedPartialResultStep1Ptr step1LocalResult; + NumericTablePtr step2MasterResult; + KeyValueDataCollectionPtr step3LocalResult; + KeyValueDataCollectionPtr step4LocalInput(new KeyValueDataCollection()); - ByteBuffer nodeCPs[nBlocks]; - ByteBuffer nodeResults; - ByteBuffer crossProductBuf; - int crossProductLen; + ByteBuffer nodeCPs[nBlocks]; + ByteBuffer nodeResults; + ByteBuffer crossProductBuf; + int crossProductLen; - for (size_t iteration = 0; iteration < maxIterations; iteration++) { - auto t1 = std::chrono::high_resolution_clock::now(); + 
for (size_t iteration = 0; iteration < maxIterations; iteration++) { + auto t1 = std::chrono::high_resolution_clock::now(); - // - // Update partial users factors - // - step1LocalResult = computeStep1Local(itemsPartialResultLocal, nFactors); + // + // Update partial users factors + // + step1LocalResult = computeStep1Local(itemsPartialResultLocal, nFactors); - serializeDAALObject(step1LocalResult.get(), nodeResults); + serializeDAALObject(step1LocalResult.get(), nodeResults); - /* Gathering step1LocalResult on the master */ - gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster); + /* Gathering step1LocalResult on the master */ + gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster); - if (rankId == ccl_root) { - step2MasterResult = - computeStep2Master(step1LocalResultsOnMaster, nFactors, nBlocks); - serializeDAALObject(step2MasterResult.get(), crossProductBuf); - crossProductLen = crossProductBuf.size(); - } + if (rankId == ccl_root) { + step2MasterResult = computeStep2Master(step1LocalResultsOnMaster, + nFactors, nBlocks); + serializeDAALObject(step2MasterResult.get(), crossProductBuf); + crossProductLen = crossProductBuf.size(); + } - // MPI_Bcast(&crossProductLen, sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, ccl_root, comm).wait(); + ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, + ccl_root, comm) + .wait(); - if (rankId != ccl_root) { - crossProductBuf.resize(crossProductLen); - } - // MPI_Bcast(&crossProductBuf[0], crossProductLen, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::broadcast(&crossProductBuf[0], crossProductLen, ccl::datatype::uint8, ccl_root, comm).wait(); + if (rankId != ccl_root) { + crossProductBuf.resize(crossProductLen); + } - step2MasterResult = - NumericTable::cast(deserializeDAALObject(&crossProductBuf[0], crossProductLen)); + ccl::broadcast(&crossProductBuf[0], crossProductLen, + ccl::datatype::uint8, ccl_root, comm) + .wait(); - step3LocalResult = computeStep3Local(itemOffset, itemsPartialResultLocal, - itemStep3LocalInput, nFactors); + step2MasterResult = NumericTable::cast( + deserializeDAALObject(&crossProductBuf[0], crossProductLen)); - /* MPI_Alltoallv to populate step4LocalInput */ - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]); - } - all2all(comm, nodeCPs, nBlocks, step4LocalInput); + step3LocalResult = computeStep3Local( + itemOffset, itemsPartialResultLocal, itemStep3LocalInput, nFactors); - usersPartialResultLocal = computeStep4Local(transposedDataTable, step2MasterResult, - step4LocalInput, nFactors); + for (size_t i = 0; i < nBlocks; i++) { + serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]); + } + all2all(comm, nodeCPs, nBlocks, step4LocalInput); - // - // Update partial items factors - // - step1LocalResult = computeStep1Local(usersPartialResultLocal, nFactors); + usersPartialResultLocal = computeStep4Local( + transposedDataTable, step2MasterResult, step4LocalInput, nFactors); - serializeDAALObject(step1LocalResult.get(), nodeResults); + // + // Update partial items factors + // + step1LocalResult = computeStep1Local(usersPartialResultLocal, nFactors); - /* Gathering step1LocalResult on the master */ - gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster); + serializeDAALObject(step1LocalResult.get(), nodeResults); - if (rankId == ccl_root) { - step2MasterResult = - computeStep2Master(step1LocalResultsOnMaster, nFactors, nBlocks); - 
serializeDAALObject(step2MasterResult.get(), crossProductBuf); - crossProductLen = crossProductBuf.size(); - } + /* Gathering step1LocalResult on the master */ + gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster); - // MPI_Bcast(&crossProductLen, sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, ccl_root, comm).wait(); + if (rankId == ccl_root) { + step2MasterResult = computeStep2Master(step1LocalResultsOnMaster, + nFactors, nBlocks); + serializeDAALObject(step2MasterResult.get(), crossProductBuf); + crossProductLen = crossProductBuf.size(); + } - if (rankId != ccl_root) { - crossProductBuf.resize(crossProductLen); - } + ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, + ccl_root, comm) + .wait(); - // MPI_Bcast(&crossProductBuf[0], crossProductLen, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::broadcast(&crossProductBuf[0], crossProductLen, ccl::datatype::uint8, ccl_root, comm).wait(); + if (rankId != ccl_root) { + crossProductBuf.resize(crossProductLen); + } - step2MasterResult = - NumericTable::cast(deserializeDAALObject(&crossProductBuf[0], crossProductLen)); + ccl::broadcast(&crossProductBuf[0], crossProductLen, + ccl::datatype::uint8, ccl_root, comm) + .wait(); - step3LocalResult = computeStep3Local(userOffset, usersPartialResultLocal, - userStep3LocalInput, nFactors); + step2MasterResult = NumericTable::cast( + deserializeDAALObject(&crossProductBuf[0], crossProductLen)); - /* MPI_Alltoallv to populate step4LocalInput */ - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]); - } - all2all(comm, nodeCPs, nBlocks, step4LocalInput); + step3LocalResult = computeStep3Local( + userOffset, usersPartialResultLocal, userStep3LocalInput, nFactors); - itemsPartialResultLocal = - computeStep4Local(dataTable, step2MasterResult, step4LocalInput, nFactors); + /* MPI_Alltoallv to populate step4LocalInput */ + for (size_t i = 0; i < nBlocks; i++) { + serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]); + } + all2all(comm, nodeCPs, nBlocks, step4LocalInput); + + itemsPartialResultLocal = computeStep4Local( + dataTable, step2MasterResult, step4LocalInput, nFactors); + + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(t2 - t1).count(); + std::cout << "ALS (native): iteration " << iteration << " took " + << duration << " secs" << std::endl; + } - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(t2 - t1).count(); - std::cout << "ALS (native): iteration " << iteration << " took " << duration - << " secs" << std::endl; - } - - auto tEnd = std::chrono::high_resolution_clock::now(); - auto durationTotal = - std::chrono::duration_cast(tEnd - tStart).count(); - std::cout << "ALS (native): trainModel took " << durationTotal << " secs" << std::endl; - - /*Gather all itemsPartialResultLocal to itemsPartialResultsMaster on the master and - * distributing the result over other ranks*/ - // serializeDAALObject(itemsPartialResultLocal.get(), nodeResults); - // gatherItems(nodeResults, nBlocks); - - // serializeDAALObject(usersPartialResultLocal.get(), nodeResults); - // gatherUsers(nodeResults, nBlocks); + auto tEnd = std::chrono::high_resolution_clock::now(); + auto durationTotal = + std::chrono::duration_cast(tEnd - tStart).count(); + std::cout << "ALS (native): trainModel took " << durationTotal << " secs" + << std::endl; } static size_t 
getOffsetFromOffsetTable(NumericTablePtr offsetTable) { - size_t ret; - BlockDescriptor block; - offsetTable->getBlockOfRows(0, 1, readOnly, block); - ret = (size_t)((block.getBlockPtr())[0]); - offsetTable->releaseBlockOfRows(block); + size_t ret; + BlockDescriptor block; + offsetTable->getBlockOfRows(0, 1, readOnly, block); + ret = (size_t)((block.getBlockPtr())[0]); + offsetTable->releaseBlockOfRows(block); - return ret; + return ret; } /* @@ -525,47 +453,42 @@ static size_t getOffsetFromOffsetTable(NumericTablePtr offsetTable) { * Signature: * (Ljava/nio/ByteBuffer;IILorg/apache/spark/ml/recommendation/ALSPartitionInfo;)Ljava/nio/ByteBuffer; */ -JNIEXPORT jobject JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cShuffleData( - JNIEnv* env, jobject obj, jobject dataBuffer, jint nTotalKeys, jint nBlocks, +JNIEXPORT jobject JNICALL +Java_org_apache_spark_ml_recommendation_ALSDALImpl_cShuffleData( + JNIEnv *env, jobject obj, jobject dataBuffer, jint nTotalKeys, jint nBlocks, jobject infoObj) { - // cout << "cShuffleData: rank " << rankId << endl; - cout << "RATING_SIZE: " << RATING_SIZE << endl; - - ccl::communicator &comm = getComm(); - - jbyte* ratingsBuf = (jbyte*)env->GetDirectBufferAddress(dataBuffer); - - jlong ratingsNum = env->GetDirectBufferCapacity(dataBuffer) / RATING_SIZE; - - std::vector ratingPartitions(nBlocks); - - for (int i = 0; i < ratingsNum; i++) { - Rating* rating = (Rating*)(ratingsBuf + RATING_SIZE * i); - int partition = getPartiton(rating->user, nTotalKeys, nBlocks); - ratingPartitions[partition].push_back(*rating); - } - - // for (int i = 0; i < nBlocks; i++) { - // cout << "Partition " << i << endl; - // for (auto r : ratingPartitions[i]) { - // cout << r.user << " " << r.item << " " << r.rating << endl; - // } - // } - - size_t newRatingsNum = 0; - size_t newCsrRowNum = 0; - Rating* ratings = shuffle_all2all(comm, ratingPartitions, nBlocks, newRatingsNum, newCsrRowNum); - - // Get the class of the input object - jclass clazz = env->GetObjectClass(infoObj); - // Get Field references - jfieldID ratingsNumField = env->GetFieldID(clazz, "ratingsNum", "I"); - jfieldID csrRowNumField = env->GetFieldID(clazz, "csrRowNum", "I"); - - env->SetIntField(infoObj, ratingsNumField, newRatingsNum); - env->SetIntField(infoObj, csrRowNumField, newCsrRowNum); - - return env->NewDirectByteBuffer(ratings, newRatingsNum*RATING_SIZE); + // cout << "cShuffleData: rank " << rankId << endl; + cout << "RATING_SIZE: " << RATING_SIZE << endl; + + ccl::communicator &comm = getComm(); + + jbyte *ratingsBuf = (jbyte *)env->GetDirectBufferAddress(dataBuffer); + + jlong ratingsNum = env->GetDirectBufferCapacity(dataBuffer) / RATING_SIZE; + + std::vector ratingPartitions(nBlocks); + + for (int i = 0; i < ratingsNum; i++) { + Rating *rating = (Rating *)(ratingsBuf + RATING_SIZE * i); + int partition = getPartiton(rating->user, nTotalKeys, nBlocks); + ratingPartitions[partition].push_back(*rating); + } + + size_t newRatingsNum = 0; + size_t newCsrRowNum = 0; + Rating *ratings = shuffle_all2all(comm, ratingPartitions, nBlocks, + newRatingsNum, newCsrRowNum); + + // Get the class of the input object + jclass clazz = env->GetObjectClass(infoObj); + // Get Field references + jfieldID ratingsNumField = env->GetFieldID(clazz, "ratingsNum", "I"); + jfieldID csrRowNumField = env->GetFieldID(clazz, "csrRowNum", "I"); + + env->SetIntField(infoObj, ratingsNumField, newRatingsNum); + env->SetIntField(infoObj, csrRowNumField, newCsrRowNum); + + return env->NewDirectByteBuffer(ratings, newRatingsNum * 
RATING_SIZE); } /* @@ -574,95 +497,80 @@ JNIEXPORT jobject JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cSh * Signature: (JJIIDDIIILorg/apache/spark/ml/recommendation/ALSResult;)J */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALImplictALS( - JNIEnv* env, jobject obj, jlong numTableAddr, jlong nUsers, jint nFactors, - jint maxIter, jdouble regParam, jdouble alpha, jint executor_num, jint executor_cores, - jint partitionId, jobject resultObj) { - - ccl::communicator &comm = getComm(); - size_t rankId = comm.rank(); - - dataTable = *((CSRNumericTablePtr*)numTableAddr); - // dataTable.reset(createFloatSparseTable("/home/xiaochang/github/oneDAL-upstream/samples/daal/cpp/mpi/data/distributed/implicit_als_csr_1.csv")); - - // printNumericTable(dataTable, "cDALImplictALS", 10); - cout << "ALS (native): Input info: " << endl; - cout << "- NumberOfRows: " << dataTable->getNumberOfRows() << endl; - cout << "- NumberOfColumns: " << dataTable->getNumberOfColumns() << endl; - cout << "- NumberOfRatings: " << dataTable->getDataSize() << endl; - cout << "- fullNUsers: " << nUsers << endl; - cout << "- nFactors: " << nFactors << endl; - - // Set number of threads for oneDAL to use for each rank - services::Environment::getInstance()->setNumberOfThreads(executor_cores); - int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads(); - cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl; - - int nBlocks = executor_num; - initializeModel(rankId, comm, partitionId, nBlocks, nUsers, nFactors); - trainModel(rankId, comm, partitionId, executor_num, nFactors, maxIter); - - auto pUser = - usersPartialResultLocal->get(training::outputOfStep4ForStep1)->getFactors(); - // auto pUserIndices = - // usersPartialResultLocal->get(training::outputOfStep4ForStep1)->getIndices(); - auto pItem = - itemsPartialResultLocal->get(training::outputOfStep4ForStep1)->getFactors(); - // auto pItemIndices = - // itemsPartialResultsMaster[i]->get(training::outputOfStep4ForStep1)->getIndices(); - - std::cout << "\n=== Results for Rank " << rankId << "===\n" << std::endl; - // std::cout << "Partition ID: " << partitionId << std::endl; - printNumericTable(pUser, "User Factors (first 10 rows x 20 columns):", 10, 20); - printNumericTable(pItem, "Item Factors (first 10 rows x 20 columns):", 10, 20); - std::cout << "User Offset: " << getOffsetFromOffsetTable(userOffset) << std::endl; - std::cout << "Item Offset: " << getOffsetFromOffsetTable(itemOffset) << std::endl; - std::cout << std::endl; - - // printNumericTable(userOffset, "userOffset"); - // printNumericTable(itemOffset, "itemOffset"); - - // if (rankId == ccl_root) { - // for (int i = 0; i < nBlocks; i++) { - // printNumericTable(NumericTable::cast((*userOffsetsOnMaster)[i]), - // "userOffsetsOnMaster"); - // } - - // for (int i = 0; i < nBlocks; i++) { - // printNumericTable(NumericTable::cast((*itemOffsetsOnMaster)[i]), - // "itemOffsetsOnMaster"); - // } - // } - - // printf("native pUser %ld, pItem %ld", (jlong)&pUser, (jlong)&pItem); - - // Get the class of the input object - jclass clazz = env->GetObjectClass(resultObj); - - // Fill in rankId - jfieldID cRankIdField = env->GetFieldID(clazz, "rankId", "J"); - env->SetLongField(resultObj, cRankIdField, (jlong)rankId); - - // Fill in cUsersFactorsNumTab & cItemsFactorsNumTab - // Get Field references - jfieldID cUsersFactorsNumTabField = env->GetFieldID(clazz, "cUsersFactorsNumTab", "J"); - jfieldID cItemsFactorsNumTabField = 
env->GetFieldID(clazz, "cItemsFactorsNumTab", "J"); - // Set factors as result, should use heap memory - NumericTablePtr* retUser = new NumericTablePtr(pUser); - NumericTablePtr* retItem = new NumericTablePtr(pItem); - env->SetLongField(resultObj, cUsersFactorsNumTabField, (jlong)retUser); - env->SetLongField(resultObj, cItemsFactorsNumTabField, (jlong)retItem); - - // Fill in cUserOffset & cItemOffset - jfieldID cUserOffsetField = env->GetFieldID(clazz, "cUserOffset", "J"); - assert(cUserOffsetField != NULL); - env->SetLongField(resultObj, cUserOffsetField, - (jlong)getOffsetFromOffsetTable(userOffset)); - - jfieldID cItemOffsetField = env->GetFieldID(clazz, "cItemOffset", "J"); - assert(cItemOffsetField != NULL); - env->SetLongField(resultObj, cItemOffsetField, - (jlong)getOffsetFromOffsetTable(itemOffset)); - - return 0; +JNIEXPORT jlong JNICALL +Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALImplictALS( + JNIEnv *env, jobject obj, jlong numTableAddr, jlong nUsers, jint nFactors, + jint maxIter, jdouble regParam, jdouble alpha, jint executor_num, + jint executor_cores, jint partitionId, jobject resultObj) { + + ccl::communicator &comm = getComm(); + size_t rankId = comm.rank(); + + dataTable = *((CSRNumericTablePtr *)numTableAddr); + + cout << "ALS (native): Input info: " << endl; + cout << "- NumberOfRows: " << dataTable->getNumberOfRows() << endl; + cout << "- NumberOfColumns: " << dataTable->getNumberOfColumns() << endl; + cout << "- NumberOfRatings: " << dataTable->getDataSize() << endl; + cout << "- fullNUsers: " << nUsers << endl; + cout << "- nFactors: " << nFactors << endl; + + // Set number of threads for oneDAL to use for each rank + services::Environment::getInstance()->setNumberOfThreads(executor_cores); + int nThreadsNew = + services::Environment::getInstance()->getNumberOfThreads(); + cout << "oneDAL (native): Number of CPU threads used: " << nThreadsNew + << endl; + + int nBlocks = executor_num; + initializeModel(rankId, comm, partitionId, nBlocks, nUsers, nFactors); + trainModel(rankId, comm, partitionId, executor_num, nFactors, maxIter); + + auto pUser = usersPartialResultLocal->get(training::outputOfStep4ForStep1) + ->getFactors(); + auto pItem = itemsPartialResultLocal->get(training::outputOfStep4ForStep1) + ->getFactors(); + + std::cout << "\n=== Results for Rank " << rankId << "===\n" << std::endl; + printNumericTable(pUser, "User Factors (first 10 rows x 20 columns):", 10, + 20); + printNumericTable(pItem, "Item Factors (first 10 rows x 20 columns):", 10, + 20); + std::cout << "User Offset: " << getOffsetFromOffsetTable(userOffset) + << std::endl; + std::cout << "Item Offset: " << getOffsetFromOffsetTable(itemOffset) + << std::endl; + std::cout << std::endl; + + // Get the class of the input object + jclass clazz = env->GetObjectClass(resultObj); + + // Fill in rankId + jfieldID cRankIdField = env->GetFieldID(clazz, "rankId", "J"); + env->SetLongField(resultObj, cRankIdField, (jlong)rankId); + + // Fill in cUsersFactorsNumTab & cItemsFactorsNumTab + // Get Field references + jfieldID cUsersFactorsNumTabField = + env->GetFieldID(clazz, "cUsersFactorsNumTab", "J"); + jfieldID cItemsFactorsNumTabField = + env->GetFieldID(clazz, "cItemsFactorsNumTab", "J"); + // Set factors as result, should use heap memory + NumericTablePtr *retUser = new NumericTablePtr(pUser); + NumericTablePtr *retItem = new NumericTablePtr(pItem); + env->SetLongField(resultObj, cUsersFactorsNumTabField, (jlong)retUser); + env->SetLongField(resultObj, cItemsFactorsNumTabField, 
(jlong)retItem); + + // Fill in cUserOffset & cItemOffset + jfieldID cUserOffsetField = env->GetFieldID(clazz, "cUserOffset", "J"); + assert(cUserOffsetField != NULL); + env->SetLongField(resultObj, cUserOffsetField, + (jlong)getOffsetFromOffsetTable(userOffset)); + + jfieldID cItemOffsetField = env->GetFieldID(clazz, "cItemOffset", "J"); + assert(cItemOffsetField != NULL); + env->SetLongField(resultObj, cItemOffsetField, + (jlong)getOffsetFromOffsetTable(itemOffset)); + + return 0; } diff --git a/mllib-dal/src/main/native/ALSShuffle.cpp b/mllib-dal/src/main/native/ALSShuffle.cpp index 73440d253..313a0c393 100644 --- a/mllib-dal/src/main/native/ALSShuffle.cpp +++ b/mllib-dal/src/main/native/ALSShuffle.cpp @@ -1,9 +1,25 @@ +/******************************************************************************* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + +#include #include #include -#include -#include -#include #include +#include +#include #include "ALSShuffle.h" @@ -13,19 +29,18 @@ std::vector recvData; jlong getPartiton(jlong key, jlong totalKeys, long nBlocks) { - jlong itemsInBlock = totalKeys / nBlocks; + jlong itemsInBlock = totalKeys / nBlocks; - return min(key / itemsInBlock, nBlocks - 1); + return min(key / itemsInBlock, nBlocks - 1); } -// Compares two Rating according to userId. -bool compareRatingByUser(Rating r1, Rating r2) -{ - if (r1.user < r2.user) - return true; - if (r1.user == r2.user && r1.item < r2.item) - return true; - return false; +// Compares two Rating according to userId. 
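+// Note (descriptive): shuffle_all2all sorts the received ratings with this
+// comparator (by user, then by item) so that each user's rows are contiguous;
+// distinct_count then derives the CSR row count from the sorted data.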
+bool compareRatingByUser(Rating r1, Rating r2) { + if (r1.user < r2.user) + return true; + if (r1.user == r2.user && r1.item < r2.item) + return true; + return false; } bool compareRatingUserEquality(Rating &r1, Rating &r2) { @@ -33,70 +48,80 @@ bool compareRatingUserEquality(Rating &r1, Rating &r2) { } int distinct_count(std::vector &data) { - long curUser = -1; - long count = 0; - for (auto i : data) { - if (i.user > curUser) { - curUser = i.user; - count += 1; - } - } - return count; + long curUser = -1; + long count = 0; + for (auto i : data) { + if (i.user > curUser) { + curUser = i.user; + count += 1; + } + } + return count; } -Rating * shuffle_all2all(ccl::communicator &comm, std::vector &partitions, size_t nBlocks, size_t &newRatingsNum, size_t &newCsrRowNum) { - size_t sendBufSize = 0; - size_t recvBufSize = 0; - vector perNodeSendLens(nBlocks); - vector perNodeRecvLens(nBlocks); - - ByteBuffer sendData; - - // Calculate send buffer size - for (size_t i = 0; i < nBlocks; i++) { - perNodeSendLens[i] = partitions[i].size() * RATING_SIZE; - // cout << "rank " << rankId << " Send partition " << i << " size " << perNodeSendLens[i] << endl; - sendBufSize += perNodeSendLens[i]; - } - cout << "sendData size " << sendBufSize << endl; - sendData.resize(sendBufSize); - - // Fill in send buffer - size_t offset = 0; - for (size_t i = 0; i < nBlocks; i++) - { - memcpy(sendData.data()+offset, partitions[i].data(), perNodeSendLens[i]); - offset += perNodeSendLens[i]; - } - - // Send lens first - ccl::alltoall(perNodeSendLens.data(), perNodeRecvLens.data(), sizeof(size_t), ccl::datatype::uint8, comm).wait(); - - // Calculate recv buffer size - for (size_t i = 0; i < nBlocks; i++) { - // cout << "rank " << rankId << " Recv partition " << i << " size " << perNodeRecvLens[i] << endl; - recvBufSize += perNodeRecvLens[i]; - } - - int ratingsNum = recvBufSize / RATING_SIZE; - recvData.resize(ratingsNum); - - // Send data - ccl::alltoallv(sendData.data(), perNodeSendLens, recvData.data(), perNodeRecvLens, ccl::datatype::uint8, comm).wait(); - - sort(recvData.begin(), recvData.end(), compareRatingByUser); - - // for (auto r : recvData) { - // cout << r.user << " " << r.item << " " << r.rating << endl; - // } - - newRatingsNum = recvData.size(); - // RatingPartition::iterator iter = std::unique(recvData.begin(), recvData.end(), compareRatingUserEquality); - // newCsrRowNum = std::distance(recvData.begin(), iter); - newCsrRowNum = distinct_count(recvData); - - cout << "newRatingsNum: " << newRatingsNum << " newCsrRowNum: " << newCsrRowNum << endl; - - return recvData.data(); +Rating *shuffle_all2all(ccl::communicator &comm, + std::vector &partitions, + size_t nBlocks, size_t &newRatingsNum, + size_t &newCsrRowNum) { + size_t sendBufSize = 0; + size_t recvBufSize = 0; + vector perNodeSendLens(nBlocks); + vector perNodeRecvLens(nBlocks); + + ByteBuffer sendData; + + // Calculate send buffer size + for (size_t i = 0; i < nBlocks; i++) { + perNodeSendLens[i] = partitions[i].size() * RATING_SIZE; + // cout << "rank " << rankId << " Send partition " << i << " size " << + // perNodeSendLens[i] << endl; + sendBufSize += perNodeSendLens[i]; + } + cout << "sendData size " << sendBufSize << endl; + sendData.resize(sendBufSize); + + // Fill in send buffer + size_t offset = 0; + for (size_t i = 0; i < nBlocks; i++) { + memcpy(sendData.data() + offset, partitions[i].data(), + perNodeSendLens[i]); + offset += perNodeSendLens[i]; + } + + // Send lens first + ccl::alltoall(perNodeSendLens.data(), perNodeRecvLens.data(), + 
sizeof(size_t), ccl::datatype::uint8, comm) + .wait(); + + // Calculate recv buffer size + for (size_t i = 0; i < nBlocks; i++) { + // cout << "rank " << rankId << " Recv partition " << i << " size " << + // perNodeRecvLens[i] << endl; + recvBufSize += perNodeRecvLens[i]; + } + + int ratingsNum = recvBufSize / RATING_SIZE; + recvData.resize(ratingsNum); + + // Send data + ccl::alltoallv(sendData.data(), perNodeSendLens, recvData.data(), + perNodeRecvLens, ccl::datatype::uint8, comm) + .wait(); + + sort(recvData.begin(), recvData.end(), compareRatingByUser); + + // for (auto r : recvData) { + // cout << r.user << " " << r.item << " " << r.rating << endl; + // } + + newRatingsNum = recvData.size(); + // RatingPartition::iterator iter = std::unique(recvData.begin(), + // recvData.end(), compareRatingUserEquality); newCsrRowNum = + // std::distance(recvData.begin(), iter); + newCsrRowNum = distinct_count(recvData); + + cout << "newRatingsNum: " << newRatingsNum + << " newCsrRowNum: " << newCsrRowNum << endl; + + return recvData.data(); } - diff --git a/mllib-dal/src/main/native/ALSShuffle.h b/mllib-dal/src/main/native/ALSShuffle.h index dbe864978..62238586a 100644 --- a/mllib-dal/src/main/native/ALSShuffle.h +++ b/mllib-dal/src/main/native/ALSShuffle.h @@ -1,11 +1,28 @@ +/******************************************************************************* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + #pragma once #include +#include struct Rating { - jlong user; - jlong item; - jfloat rating; + jlong user; + jlong item; + jfloat rating; } __attribute__((packed)); const int RATING_SIZE = sizeof(Rating); @@ -14,4 +31,6 @@ typedef std::vector ByteBuffer; typedef std::vector RatingPartition; jlong getPartiton(jlong key, jlong totalKeys, long nBlocks); -Rating * shuffle_all2all(ccl::communicator &comm, std::vector &partitions, size_t nBlocks, size_t &ratingsNum, size_t &csrRowNum); +Rating *shuffle_all2all(ccl::communicator &comm, + std::vector &partitions, + size_t nBlocks, size_t &ratingsNum, size_t &csrRowNum); diff --git a/mllib-dal/src/main/native/KMeansDALImpl.cpp b/mllib-dal/src/main/native/KMeansDALImpl.cpp index d9c7a2f29..db8db80b1 100644 --- a/mllib-dal/src/main/native/KMeansDALImpl.cpp +++ b/mllib-dal/src/main/native/KMeansDALImpl.cpp @@ -1,27 +1,27 @@ /******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ -#include +#include #include #include -#include +#include -#include "service.h" -#include "org_apache_spark_ml_clustering_KMeansDALImpl.h" #include "OneCCL.h" +#include "org_apache_spark_ml_clustering_KMeansDALImpl.h" +#include "service.h" using namespace std; using namespace daal; @@ -31,30 +31,36 @@ const int ccl_root = 0; typedef double algorithmFPType; /* Algorithm floating-point type */ -static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, - const NumericTablePtr & pData, const NumericTablePtr & initialCentroids, - size_t nClusters, size_t nBlocks, algorithmFPType &ret_cost) -{ - const bool isRoot = (rankId == ccl_root); +static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, + const NumericTablePtr &pData, + const NumericTablePtr &initialCentroids, + size_t nClusters, size_t nBlocks, + algorithmFPType &ret_cost) { + const bool isRoot = (rankId == ccl_root); size_t CentroidsArchLength = 0; InputDataArchive inputArch; - if (isRoot) - { - /*Retrieve the algorithm results and serialize them */ + if (isRoot) { + /* Retrieve the algorithm results and serialize them */ initialCentroids->serialize(inputArch); CentroidsArchLength = inputArch.getSizeOfArchive(); } /* Get partial results from the root node */ - ccl::broadcast(&CentroidsArchLength, sizeof(size_t), ccl::datatype::uint8, ccl_root, comm).wait(); + ccl::broadcast(&CentroidsArchLength, sizeof(size_t), ccl::datatype::uint8, + ccl_root, comm) + .wait(); ByteBuffer nodeCentroids(CentroidsArchLength); - if (isRoot) inputArch.copyArchiveToArray(&nodeCentroids[0], CentroidsArchLength); + if (isRoot) + inputArch.copyArchiveToArray(&nodeCentroids[0], CentroidsArchLength); - ccl::broadcast(&nodeCentroids[0], CentroidsArchLength, ccl::datatype::uint8, ccl_root, comm).wait(); + ccl::broadcast(&nodeCentroids[0], CentroidsArchLength, ccl::datatype::uint8, + ccl_root, comm) + .wait(); /* Deserialize centroids data */ - OutputDataArchive outArch(nodeCentroids.size() ? &nodeCentroids[0] : NULL, CentroidsArchLength); + OutputDataArchive outArch(nodeCentroids.size() ? 
&nodeCentroids[0] : NULL, + CentroidsArchLength); NumericTablePtr centroids(new HomogenNumericTable()); @@ -76,10 +82,10 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, size_t perNodeArchLength = dataArch.getSizeOfArchive(); ByteBuffer serializedData; - /* Serialized data is of equal size on each node if each node called compute() equal number of times */ + /* Serialized data is of equal size on each node if each node called + * compute() equal number of times */ vector recvCounts(nBlocks); - for (size_t i = 0; i < nBlocks; i++) - { + for (size_t i = 0; i < nBlocks; i++) { recvCounts[i] = perNodeArchLength; } serializedData.resize(perNodeArchLength * nBlocks); @@ -88,30 +94,37 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, dataArch.copyArchiveToArray(&nodeResults[0], perNodeArchLength); /* Transfer partial results to step 2 on the root node */ - ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], recvCounts, ccl::datatype::uint8, comm).wait(); + ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], + recvCounts, ccl::datatype::uint8, comm) + .wait(); - if (isRoot) - { + if (isRoot) { /* Create an algorithm to compute k-means on the master node */ - kmeans::Distributed masterAlgorithm(nClusters); + kmeans::Distributed masterAlgorithm( + nClusters); - for (size_t i = 0; i < nBlocks; i++) - { + for (size_t i = 0; i < nBlocks; i++) { /* Deserialize partial results from step 1 */ - OutputDataArchive dataArch(&serializedData[perNodeArchLength * i], perNodeArchLength); + OutputDataArchive dataArch(&serializedData[perNodeArchLength * i], + perNodeArchLength); - kmeans::PartialResultPtr dataForStep2FromStep1(new kmeans::PartialResult()); + kmeans::PartialResultPtr dataForStep2FromStep1( + new kmeans::PartialResult()); dataForStep2FromStep1->deserialize(dataArch); - /* Set local partial results as input for the master-node algorithm */ - masterAlgorithm.input.add(kmeans::partialResults, dataForStep2FromStep1); + /* Set local partial results as input for the master-node algorithm + */ + masterAlgorithm.input.add(kmeans::partialResults, + dataForStep2FromStep1); } /* Merge and finalizeCompute k-means on the master node */ masterAlgorithm.compute(); masterAlgorithm.finalizeCompute(); - ret_cost = masterAlgorithm.getResult()->get(kmeans::objectiveFunction)->getValue(0, 0); + ret_cost = masterAlgorithm.getResult() + ->get(kmeans::objectiveFunction) + ->getValue(0, 0); /* Retrieve the algorithm results */ return masterAlgorithm.getResult()->get(kmeans::centroids); @@ -119,7 +132,9 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, return NumericTablePtr(); } -static bool isCenterConverged(const algorithmFPType *oldCenter, const algorithmFPType *newCenter, size_t dim, double tolerance) { +static bool isCenterConverged(const algorithmFPType *oldCenter, + const algorithmFPType *newCenter, size_t dim, + double tolerance) { algorithmFPType sums = 0.0; @@ -129,7 +144,9 @@ static bool isCenterConverged(const algorithmFPType *oldCenter, const algorithmF return sums <= tolerance * tolerance; } -static bool areAllCentersConverged(const NumericTablePtr & oldCenters, const NumericTablePtr &newCenters, double tolerance) { +static bool areAllCentersConverged(const NumericTablePtr &oldCenters, + const NumericTablePtr &newCenters, + double tolerance) { size_t rows = oldCenters->getNumberOfRows(); size_t cols = oldCenters->getNumberOfColumns(); @@ -142,9 +159,8 @@ static bool 
areAllCentersConverged(const NumericTablePtr & oldCenters, const Num algorithmFPType *arrayNewCenters = blockNewCenters.getBlockPtr(); for (size_t i = 0; i < rows; i++) { - if (!isCenterConverged(&arrayOldCenters[i*cols], - &arrayNewCenters[i*cols], - cols, tolerance)) + if (!isCenterConverged(&arrayOldCenters[i * cols], + &arrayNewCenters[i * cols], cols, tolerance)) return false; } @@ -156,69 +172,78 @@ static bool areAllCentersConverged(const NumericTablePtr & oldCenters, const Num * Method: cKMeansDALComputeWithInitCenters * Signature: (JJIDIIILorg/apache/spark/ml/clustering/KMeansResult;)J */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMeansDALComputeWithInitCenters - (JNIEnv *env, jobject obj, - jlong pNumTabData, jlong pNumTabCenters, - jint cluster_num, jdouble tolerance, jint iteration_num, - jint executor_num, jint executor_cores, - jobject resultObj) { +JNIEXPORT jlong JNICALL +Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMeansDALComputeWithInitCenters( + JNIEnv *env, jobject obj, jlong pNumTabData, jlong pNumTabCenters, + jint cluster_num, jdouble tolerance, jint iteration_num, jint executor_num, + jint executor_cores, jobject resultObj) { - ccl::communicator &comm = getComm(); - size_t rankId = comm.rank(); + ccl::communicator &comm = getComm(); + size_t rankId = comm.rank(); - NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); - NumericTablePtr centroids = *((NumericTablePtr *)pNumTabCenters); + NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); + NumericTablePtr centroids = *((NumericTablePtr *)pNumTabCenters); - // Set number of threads for oneDAL to use for each rank - services::Environment::getInstance()->setNumberOfThreads(executor_cores); + // Set number of threads for oneDAL to use for each rank + services::Environment::getInstance()->setNumberOfThreads(executor_cores); - int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads(); - cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl; + int nThreadsNew = + services::Environment::getInstance()->getNumberOfThreads(); + cout << "oneDAL (native): Number of CPU threads used: " << nThreadsNew + << endl; - algorithmFPType totalCost; + algorithmFPType totalCost; - NumericTablePtr newCentroids; - bool converged = false; + NumericTablePtr newCentroids; + bool converged = false; - int it = 0; - for (it = 0; it < iteration_num && !converged; it++) { - auto t1 = std::chrono::high_resolution_clock::now(); + int it = 0; + for (it = 0; it < iteration_num && !converged; it++) { + auto t1 = std::chrono::high_resolution_clock::now(); - newCentroids = kmeans_compute(rankId, comm, pData, centroids, cluster_num, executor_num, totalCost); + newCentroids = kmeans_compute(rankId, comm, pData, centroids, + cluster_num, executor_num, totalCost); - if (rankId == ccl_root) { - converged = areAllCentersConverged(centroids, newCentroids, tolerance); + if (rankId == ccl_root) { + converged = + areAllCentersConverged(centroids, newCentroids, tolerance); + } + + // Sync converged status + ccl::broadcast(&converged, 1, ccl::datatype::uint8, ccl_root, comm) + .wait(); + + centroids = newCentroids; + + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(t2 - t1).count(); + std::cout << "KMeans (native): iteration " << it << " took " << duration + << " secs" << std::endl; } - // Sync converged status - ccl::broadcast(&converged, 1, ccl::datatype::uint8, ccl_root, comm).wait(); - - centroids = newCentroids; - - 
auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "KMeans (native): iteration " << it << " took " << duration << " secs" << std::endl; - } - - if (rankId == ccl_root) { - if (it == iteration_num) - std::cout << "KMeans (native): reached " << iteration_num << " max iterations." << std::endl; - else - std::cout << "KMeans (native): converged in " << it << " iterations." << std::endl; - - // Get the class of the input object - jclass clazz = env->GetObjectClass(resultObj); - // Get Field references - jfieldID totalCostField = env->GetFieldID(clazz, "totalCost", "D"); - jfieldID iterationNumField = env->GetFieldID(clazz, "iterationNum", "I"); - - // Set iteration num for result - env->SetIntField(resultObj, iterationNumField, it); - // Set cost for result - env->SetDoubleField(resultObj, totalCostField, totalCost); - - NumericTablePtr *ret = new NumericTablePtr(centroids); - return (jlong)ret; - } else - return (jlong)0; -} \ No newline at end of file + if (rankId == ccl_root) { + if (it == iteration_num) + std::cout << "KMeans (native): reached " << iteration_num + << " max iterations." << std::endl; + else + std::cout << "KMeans (native): converged in " << it + << " iterations." << std::endl; + + // Get the class of the input object + jclass clazz = env->GetObjectClass(resultObj); + // Get Field references + jfieldID totalCostField = env->GetFieldID(clazz, "totalCost", "D"); + jfieldID iterationNumField = + env->GetFieldID(clazz, "iterationNum", "I"); + + // Set iteration num for result + env->SetIntField(resultObj, iterationNumField, it); + // Set cost for result + env->SetDoubleField(resultObj, totalCostField, totalCost); + + NumericTablePtr *ret = new NumericTablePtr(centroids); + return (jlong)ret; + } else + return (jlong)0; +} diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index e3a7e2161..06b8e0c13 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -21,7 +21,6 @@ CFLAGS := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++11 # The following paths setting works for self-built libs from source code # https://github.com/oneapi-src/oneCCL. 
If oneCCL package in oneAPI Toolkit is used, # Should change paths to ${CCL_ROOT}/{include,lib}/cpu_icc instead - INCS := -I $(JAVA_HOME)/include \ -I $(JAVA_HOME)/include/linux \ -I ${CCL_ROOT}/include \ @@ -30,18 +29,21 @@ INCS := -I $(JAVA_HOME)/include \ -I ./ # Use static link if possible, TBB is only available as dynamic libs - LIBS := -L${CCL_ROOT}/lib -lccl \ -L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \ -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc -# TODO: Add signal chaining support, should fix linking, package so and loading -# -L$(JAVA_HOME)/jre/lib/amd64 -ljsig CPP_SRCS += \ -./OneCCL.cpp ./OneDAL.cpp ./KMeansDALImpl.cpp ./PCADALImpl.cpp ./ALSDALImpl.cpp ./ALSShuffle.cpp ./service.cpp ./error_handling.cpp + ./OneCCL.cpp ./OneDAL.cpp ./service.cpp ./error_handling.cpp \ + ./KMeansDALImpl.cpp \ + ./PCADALImpl.cpp \ + ./ALSDALImpl.cpp ./ALSShuffle.cpp OBJS += \ -./OneCCL.o ./OneDAL.o ./KMeansDALImpl.o ./PCADALImpl.o ./ALSDALImpl.o ./ALSShuffle.o ./service.o ./error_handling.o + ./OneCCL.o ./OneDAL.o ./service.o ./error_handling.o \ + ./KMeansDALImpl.o \ + ./PCADALImpl.o \ + ./ALSDALImpl.o ./ALSShuffle.o # Output Binary OUTPUT = ../../../target/libMLlibDAL.so diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp index c733c7b33..9b1aa6e2c 100644 --- a/mllib-dal/src/main/native/OneCCL.cpp +++ b/mllib-dal/src/main/native/OneCCL.cpp @@ -1,65 +1,88 @@ -#include +/******************************************************************************* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + #include +#include #include #include #include #include -#include #include +#include #include #include +#include "OneCCL.h" #include "org_apache_spark_ml_util_OneCCL__.h" -// todo: fill initial comm_size and rank_id -size_t comm_size; -size_t rank_id; +static const int CCL_IP_LEN = 128; +static std::list local_host_ips; +static size_t comm_size = 0; +static size_t rank_id = 0; +static std::vector g_comms; -std::vector g_comms; +ccl::communicator &getComm() { return g_comms[0]; } -ccl::communicator &getComm() { - return g_comms[0]; -} +/* + * Class: org_apache_spark_ml_util_OneCCL__ + * Method: c_init + * Signature: (IILjava/lang/String;Lorg/apache/spark/ml/util/CCLParam;)I + */ +JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init( + JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, + jobject param) { -JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init - (JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, jobject param) { - - std::cerr << "OneCCL (native): init" << std::endl; + std::cerr << "OneCCL (native): init" << std::endl; - auto t1 = std::chrono::high_resolution_clock::now(); + auto t1 = std::chrono::high_resolution_clock::now(); - ccl::init(); + ccl::init(); - const char *str = env->GetStringUTFChars(ip_port, 0); - ccl::string ccl_ip_port(str); + const char *str = env->GetStringUTFChars(ip_port, 0); + ccl::string ccl_ip_port(str); - auto kvs_attr = ccl::create_kvs_attr(); - kvs_attr.set(ccl_ip_port); + auto kvs_attr = ccl::create_kvs_attr(); + kvs_attr.set(ccl_ip_port); - ccl::shared_ptr_class kvs; - kvs = ccl::create_main_kvs(kvs_attr); + ccl::shared_ptr_class kvs; + kvs = ccl::create_main_kvs(kvs_attr); - g_comms.push_back(ccl::create_communicator(size, rank, kvs)); + g_comms.push_back(ccl::create_communicator(size, rank, kvs)); - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cerr << "OneCCL (native): init took " << duration << " secs" << std::endl; + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(t2 - t1).count(); + std::cerr << "OneCCL (native): init took " << duration << " secs" + << std::endl; - rank_id = getComm().rank(); - comm_size = getComm().size(); + rank_id = getComm().rank(); + comm_size = getComm().size(); - jclass cls = env->GetObjectClass(param); - jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J"); - jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J"); + jclass cls = env->GetObjectClass(param); + jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J"); + jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J"); - env->SetLongField(param, fid_comm_size, comm_size); - env->SetLongField(param, fid_rank_id, rank_id); - env->ReleaseStringUTFChars(ip_port, str); + env->SetLongField(param, fid_comm_size, comm_size); + env->SetLongField(param, fid_rank_id, rank_id); + env->ReleaseStringUTFChars(ip_port, str); - return 1; + return 1; } /* @@ -67,13 +90,12 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init * Method: c_cleanup * Signature: ()V */ -JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup - (JNIEnv *env, jobject obj) { - - g_comms.pop_back(); +JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup( + JNIEnv *env, jobject obj) { - std::cerr << "OneCCL (native): cleanup" << std::endl; 
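// [Editor's note] Hedged sketch of the oneCCL bootstrap sequence that c_init performs
// above, with the JNI plumbing stripped away so the rendezvous flow is easier to follow.
// The kvs_attr_id::ip_port template argument is an assumption on my part: the original
// template parameters were lost when this patch was extracted, so treat the exact
// attribute id as illustrative rather than verbatim.
//
//     ccl::init();
//     ccl::string ccl_ip_port(str);                // ip/port string handed down from the Scala layer
//     auto kvs_attr = ccl::create_kvs_attr();
//     kvs_attr.set<ccl::kvs_attr_id::ip_port>(ccl_ip_port);   // root's address for the key-value store
//     auto kvs = ccl::create_main_kvs(kvs_attr);   // rendezvous point shared by all ranks
//     g_comms.push_back(ccl::create_communicator(size, rank, kvs));
//
// Every executor appears to call this with the same ip_port string, so all ranks meet at
// the same KVS and end up in a single communicator of `size` ranks; rank_id and comm_size
// are then read back from that communicator and reported to the JVM through the CCLParam
// fields set below.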
+ g_comms.pop_back(); + std::cerr << "OneCCL (native): cleanup" << std::endl; } /* @@ -81,8 +103,8 @@ JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup * Method: isRoot * Signature: ()Z */ -JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_isRoot - (JNIEnv *env, jobject obj) { +JNIEXPORT jboolean JNICALL +Java_org_apache_spark_ml_util_OneCCL_00024_isRoot(JNIEnv *env, jobject obj) { return getComm().rank() == 0; } @@ -92,8 +114,8 @@ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_isRoot * Method: rankID * Signature: ()I */ -JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_rankID - (JNIEnv *env, jobject obj) { +JNIEXPORT jint JNICALL +Java_org_apache_spark_ml_util_OneCCL_00024_rankID(JNIEnv *env, jobject obj) { return getComm().rank(); } @@ -102,11 +124,11 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_rankID * Method: setEnv * Signature: (Ljava/lang/String;Ljava/lang/String;Z)I */ -JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv - (JNIEnv *env , jobject obj, jstring key, jstring value, jboolean overwrite) { +JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv( + JNIEnv *env, jobject obj, jstring key, jstring value, jboolean overwrite) { - char* k = (char *) env->GetStringUTFChars(key, NULL); - char* v = (char *) env->GetStringUTFChars(value, NULL); + char *k = (char *)env->GetStringUTFChars(key, NULL); + char *v = (char *)env->GetStringUTFChars(value, NULL); int err = setenv(k, v, overwrite); @@ -116,15 +138,12 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv return err; } -static const int CCL_IP_LEN = 128; -std::list local_host_ips; - static int fill_local_host_ip() { struct ifaddrs *ifaddr, *ifa; int family = AF_UNSPEC; char local_ip[CCL_IP_LEN]; if (getifaddrs(&ifaddr) < 0) { - // LOG_ERROR("fill_local_host_ip: can not get host IP"); + std::cerr << "OneCCL (native): can not get host IP" << std::endl; return -1; } @@ -140,16 +159,13 @@ static int fill_local_host_ip() { memset(local_ip, 0, CCL_IP_LEN); int res = getnameinfo( ifa->ifa_addr, - (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6), - local_ip, - CCL_IP_LEN, - NULL, - 0, - NI_NUMERICHOST); + (family == AF_INET) ? 
sizeof(struct sockaddr_in) + : sizeof(struct sockaddr_in6), + local_ip, CCL_IP_LEN, NULL, 0, NI_NUMERICHOST); if (res != 0) { - std::string s("fill_local_host_ip: getnameinfo error > "); + std::string s("OneCCL (native): getnameinfo error > "); s.append(gai_strerror(res)); - // LOG_ERROR(s.c_str()); + std::cerr << s << std::endl; return -1; } local_host_ips.push_back(local_ip); @@ -157,30 +173,30 @@ static int fill_local_host_ip() { } } if (local_host_ips.empty()) { - // LOG_ERROR("fill_local_host_ip: can't find interface to get host IP"); + std::cerr << "OneCCL (native): can't find interface to get host IP" + << std::endl; return -1; } - // memset(local_host_ip, 0, CCL_IP_LEN); - // strncpy(local_host_ip, local_host_ips.front().c_str(), CCL_IP_LEN); - - // for (auto &ip : local_host_ips) - // cout << ip << endl; freeifaddrs(ifaddr); + return 0; } static bool is_valid_ip(char ip[]) { - if (fill_local_host_ip() == -1) { - std::cerr << "fill_local_host_ip error" << std::endl; - }; - for (std::list::iterator it = local_host_ips.begin(); it != local_host_ips.end(); ++it) { - if (*it == ip) { - return true; + if (fill_local_host_ip() == -1) { + std::cerr << "OneCCL (native): get local host ip error" << std::endl; + return false; + }; + + for (std::list::iterator it = local_host_ips.begin(); + it != local_host_ips.end(); ++it) { + if (*it == ip) { + return true; + } } - } - return false; + return false; } /* @@ -188,42 +204,44 @@ static bool is_valid_ip(char ip[]) { * Method: getAvailPort * Signature: (Ljava/lang/String;)I */ -JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1getAvailPort - (JNIEnv *env, jobject obj, jstring localIP) { - - // start from beginning of dynamic port - const int port_start_base = 3000; +JNIEXPORT jint JNICALL +Java_org_apache_spark_ml_util_OneCCL_00024_c_1getAvailPort(JNIEnv *env, + jobject obj, + jstring localIP) { - char* local_host_ip = (char *) env->GetStringUTFChars(localIP, NULL); + // start from beginning of dynamic port + const int port_start_base = 3000; - // check if the input ip is one of host's ips - if (!is_valid_ip(local_host_ip)) - return -1; + char *local_host_ip = (char *)env->GetStringUTFChars(localIP, NULL); - struct sockaddr_in main_server_address; - int server_listen_sock; - in_port_t port = port_start_base; + // check if the input ip is one of host's ips + if (!is_valid_ip(local_host_ip)) + return -1; - if ((server_listen_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - perror("OneCCL (native) getAvailPort error!"); - return -1; - } + struct sockaddr_in main_server_address; + int server_listen_sock; + in_port_t port = port_start_base; - main_server_address.sin_family = AF_INET; - main_server_address.sin_addr.s_addr = inet_addr(local_host_ip); - main_server_address.sin_port = htons(port); + if ((server_listen_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("OneCCL (native) getAvailPort error!"); + return -1; + } - // search for available port - while (bind(server_listen_sock, - (const struct sockaddr *)&main_server_address, - sizeof(main_server_address)) < 0) { - port++; + main_server_address.sin_family = AF_INET; + main_server_address.sin_addr.s_addr = inet_addr(local_host_ip); main_server_address.sin_port = htons(port); - } - close(server_listen_sock); + // search for available port + while (bind(server_listen_sock, + (const struct sockaddr *)&main_server_address, + sizeof(main_server_address)) < 0) { + port++; + main_server_address.sin_port = htons(port); + } + + close(server_listen_sock); - 
env->ReleaseStringUTFChars(localIP, local_host_ip); + env->ReleaseStringUTFChars(localIP, local_host_ip); - return port; + return port; } diff --git a/mllib-dal/src/main/native/OneCCL.h b/mllib-dal/src/main/native/OneCCL.h index b579c4697..056d898d3 100644 --- a/mllib-dal/src/main/native/OneCCL.h +++ b/mllib-dal/src/main/native/OneCCL.h @@ -1,3 +1,19 @@ +/******************************************************************************* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + #pragma once #include diff --git a/mllib-dal/src/main/native/OneDAL.cpp b/mllib-dal/src/main/native/OneDAL.cpp index 792225c3e..9c3c48657 100644 --- a/mllib-dal/src/main/native/OneDAL.cpp +++ b/mllib-dal/src/main/native/OneDAL.cpp @@ -1,24 +1,24 @@ /******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ +#include #include #include -#include -#include "org_apache_spark_ml_util_OneDAL__.h" +#include "org_apache_spark_ml_util_OneDAL__.h" #include "service.h" using namespace daal; @@ -32,97 +32,72 @@ extern bool daal_check_is_intel_cpu(); * Method: setNumericTableValue * Signature: (JIID)V */ -JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_setNumericTableValue - (JNIEnv *, jobject, jlong numTableAddr, jint row, jint column, jdouble value) { - - HomogenNumericTable * nt = static_cast *>(((SerializationIfacePtr *)numTableAddr)->get()); - (*nt)[row][column] = (double)value; - +JNIEXPORT void JNICALL +Java_org_apache_spark_ml_util_OneDAL_00024_setNumericTableValue( + JNIEnv *, jobject, jlong numTableAddr, jint row, jint column, + jdouble value) { + HomogenNumericTable *nt = + static_cast *>( + ((SerializationIfacePtr *)numTableAddr)->get()); + (*nt)[row][column] = (double)value; } -JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cSetDoubleBatch - (JNIEnv *env, jobject, jlong numTableAddr, jint curRows, jdoubleArray batch, jint numRows, jint numCols) { - - HomogenNumericTable *nt = static_cast *>( - ((SerializationIfacePtr *)numTableAddr)->get()); - jdouble* values = (jdouble*)env->GetPrimitiveArrayCritical(batch, 0); +/* + * Class: org_apache_spark_ml_util_OneDAL__ + * Method: cSetDoubleBatch + * Signature: (JI[DII)V + */ +JNIEXPORT void JNICALL +Java_org_apache_spark_ml_util_OneDAL_00024_cSetDoubleBatch( + JNIEnv *env, jobject, jlong numTableAddr, jint curRows, jdoubleArray batch, + jint numRows, jint numCols) { + HomogenNumericTable *nt = + static_cast *>( + ((SerializationIfacePtr *)numTableAddr)->get()); + jdouble *values = (jdouble *)env->GetPrimitiveArrayCritical(batch, 0); std::memcpy((*nt)[curRows], values, numRows * numCols * sizeof(double)); env->ReleasePrimitiveArrayCritical(batch, values, JNI_ABORT); - } - - -JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cSetDoubleIterator - (JNIEnv *env, jobject, jlong numTableAddr, jobject jiter, jint curRows) { - - jclass iterClass = env->FindClass("java/util/Iterator"); - jmethodID hasNext = env->GetMethodID(iterClass, - "hasNext", "()Z"); - jmethodID next = env->GetMethodID(iterClass, - "next", "()Ljava/lang/Object;"); - - HomogenNumericTable *nt = static_cast *>( - ((SerializationIfacePtr *)numTableAddr)->get()); - - while (env->CallBooleanMethod(jiter, hasNext)) { - jobject batch = env->CallObjectMethod(jiter, next); - - jclass batchClass = env->GetObjectClass(batch); - jlongArray joffset = (jlongArray)env->GetObjectField( - batch, env->GetFieldID(batchClass, "rowOffset", "[J")); - jdoubleArray jvalue = (jdoubleArray)env->GetObjectField( - batch, env->GetFieldID(batchClass, "values", "[D")); - jint jcols = env->GetIntField( - batch, env->GetFieldID(batchClass, "numCols", "I")); - - long numRows = env->GetArrayLength(joffset); - - jlong* rowOffset = env->GetLongArrayElements(joffset, 0); - - jdouble* values = env->GetDoubleArrayElements(jvalue, 0); - - for (int i = 0; i < numRows; i ++){ - jlong curRow = rowOffset[i] + curRows; - for(int j = 0; j < jcols; j ++) { - (*nt)[curRow][j] = values[rowOffset[i] * jcols + j]; - } - } - - env->ReleaseLongArrayElements(joffset, rowOffset, 0); - env->DeleteLocalRef(joffset); - env->ReleaseDoubleArrayElements(jvalue, values, 0); - env->DeleteLocalRef(jvalue); - env->DeleteLocalRef(batch); - env->DeleteLocalRef(batchClass); - } - env->DeleteLocalRef(iterClass); - - } - 
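// [Editor's note] Hedged sketch of the copy pattern that the surviving cSetDoubleBatch
// entry point relies on, assuming a HomogenNumericTable<double> whose rows are laid out
// contiguously; variable names follow the patch, the standalone framing is mine.
//
//     jdouble *values = (jdouble *)env->GetPrimitiveArrayCritical(batch, 0);   // pin the Java double[]
//     std::memcpy((*nt)[curRows], values, numRows * numCols * sizeof(double)); // bulk copy into the table
//     env->ReleasePrimitiveArrayCritical(batch, values, JNI_ABORT);            // JNI_ABORT: discard, nothing to copy back
//
// The cSetDoubleIterator function removed just above walked each DataBatch element by
// element through per-row JNI accesses; the batch path copies whole row blocks with a
// single memcpy while the array is pinned, which is presumably why the iterator variant
// could be dropped from this file.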
-JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cAddNumericTable - (JNIEnv *, jobject, jlong rowMergedNumericTableAddr, jlong numericTableAddr) { - - data_management::RowMergedNumericTablePtr pRowMergedNumericTable = (*(data_management::RowMergedNumericTablePtr *)rowMergedNumericTableAddr); - data_management::NumericTablePtr pNumericTable = (*(data_management::NumericTablePtr *)numericTableAddr); - pRowMergedNumericTable->addNumericTable(pNumericTable); - - } +} -JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cFreeDataMemory - (JNIEnv *, jobject, jlong numericTableAddr) { +/* + * Class: org_apache_spark_ml_util_OneDAL__ + * Method: cAddNumericTable + * Signature: (JJ)V + */ +JNIEXPORT void JNICALL +Java_org_apache_spark_ml_util_OneDAL_00024_cAddNumericTable( + JNIEnv *, jobject, jlong rowMergedNumericTableAddr, + jlong numericTableAddr) { + data_management::RowMergedNumericTablePtr pRowMergedNumericTable = (*( + data_management::RowMergedNumericTablePtr *)rowMergedNumericTableAddr); + data_management::NumericTablePtr pNumericTable = + (*(data_management::NumericTablePtr *)numericTableAddr); + pRowMergedNumericTable->addNumericTable(pNumericTable); +} - data_management::NumericTablePtr pNumericTable = (*(data_management::NumericTablePtr *)numericTableAddr); +/* + * Class: org_apache_spark_ml_util_OneDAL__ + * Method: cFreeDataMemory + * Signature: (J)V + */ +JNIEXPORT void JNICALL +Java_org_apache_spark_ml_util_OneDAL_00024_cFreeDataMemory( + JNIEnv *, jobject, jlong numericTableAddr) { + data_management::NumericTablePtr pNumericTable = + (*(data_management::NumericTablePtr *)numericTableAddr); pNumericTable->freeDataMemory(); - - } +} /* * Class: org_apache_spark_ml_util_OneDAL__ * Method: cCheckPlatformCompatibility * Signature: ()Z */ -JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlatformCompatibility - (JNIEnv *, jobject) { - // Only guarantee compatibility and performance on Intel platforms, use oneDAL lib function +JNIEXPORT jboolean JNICALL +Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlatformCompatibility( + JNIEnv *, jobject) { + // Only guarantee compatibility and performance on Intel platforms, use + // oneDAL lib function return daal_check_is_intel_cpu(); } @@ -131,35 +106,32 @@ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlat * Method: cNewCSRNumericTable * Signature: ([F[J[JJJ)J */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumericTable - (JNIEnv *env, jobject, jfloatArray data, jlongArray colIndices, jlongArray rowOffsets, jlong nFeatures, jlong nVectors) { +JNIEXPORT jlong JNICALL +Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumericTable( + JNIEnv *env, jobject, jfloatArray data, jlongArray colIndices, + jlongArray rowOffsets, jlong nFeatures, jlong nVectors) { long numData = env->GetArrayLength(data); - // long numColIndices = numData; - // long numRowOffsets = env->GetArrayLength(rowOffsets); - size_t * resultRowOffsets = NULL; - size_t * resultColIndices = NULL; - float * resultData = NULL; - CSRNumericTable * numericTable = new CSRNumericTable(resultData, resultColIndices, resultRowOffsets, nFeatures, nVectors); - numericTable->allocateDataMemory(numData); - numericTable->getArrays(&resultData, &resultColIndices, &resultRowOffsets); + size_t *resultRowOffsets = NULL; + size_t *resultColIndices = NULL; + float *resultData = NULL; - size_t * pRowOffsets = (size_t *)env->GetLongArrayElements(rowOffsets, 0); - size_t * pColIndices = 
(size_t *)env->GetLongArrayElements(colIndices, 0); - float * pData = env->GetFloatArrayElements(data, 0); + CSRNumericTable *numericTable = new CSRNumericTable( + resultData, resultColIndices, resultRowOffsets, nFeatures, nVectors); + numericTable->allocateDataMemory(numData); + numericTable->getArrays(&resultData, &resultColIndices, + &resultRowOffsets); - // std::memcpy(resultRowOffsets, pRowOffsets, numRowOffsets*sizeof(jlong)); - // std::memcpy(resultColIndices, pColIndices, numColIndices*sizeof(jlong)); - // std::memcpy(resultData, pData, numData*sizeof(float)); + size_t *pRowOffsets = (size_t *)env->GetLongArrayElements(rowOffsets, 0); + size_t *pColIndices = (size_t *)env->GetLongArrayElements(colIndices, 0); + float *pData = env->GetFloatArrayElements(data, 0); - for (size_t i = 0; i < (size_t)numData; ++i) - { - resultData[i] = pData[i]; + for (size_t i = 0; i < (size_t)numData; ++i) { + resultData[i] = pData[i]; resultColIndices[i] = pColIndices[i]; } - for (size_t i = 0; i < (size_t)nVectors + 1; ++i) - { + for (size_t i = 0; i < (size_t)nVectors + 1; ++i) { resultRowOffsets[i] = pRowOffsets[i]; } @@ -169,7 +141,5 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumeri CSRNumericTablePtr *ret = new CSRNumericTablePtr(numericTable); - //printNumericTable(*ret, "cNewCSRNumericTable", 10); - return (jlong)ret; } diff --git a/mllib-dal/src/main/native/PCADALImpl.cpp b/mllib-dal/src/main/native/PCADALImpl.cpp index 95172d05f..c04484a09 100644 --- a/mllib-dal/src/main/native/PCADALImpl.cpp +++ b/mllib-dal/src/main/native/PCADALImpl.cpp @@ -1,12 +1,26 @@ -#include - -#include "service.h" +/******************************************************************************* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ #include +#include #include -#include "org_apache_spark_ml_feature_PCADALImpl.h" #include "OneCCL.h" +#include "org_apache_spark_ml_feature_PCADALImpl.h" +#include "service.h" using namespace std; using namespace daal; @@ -21,127 +35,156 @@ typedef double algorithmFPType; /* Algorithm floating-point type */ * Method: cPCATrainDAL * Signature: (JIIILorg/apache/spark/ml/feature/PCAResult;)J */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL( - JNIEnv *env, jobject obj, jlong pNumTabData, jint k, jint executor_num, jint executor_cores, - jobject resultObj) { - - ccl::communicator &comm = getComm(); - size_t rankId = comm.rank(); - - const size_t nBlocks = executor_num; - const int comm_size = executor_num; - - NumericTablePtr pData = *((NumericTablePtr*)pNumTabData); - // Source data already normalized - pData->setNormalizationFlag(NumericTableIface::standardScoreNormalized); - - // Set number of threads for oneDAL to use for each rank - services::Environment::getInstance()->setNumberOfThreads(executor_cores); - - int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads(); - cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl; - - auto t1 = std::chrono::high_resolution_clock::now(); - - pca::Distributed localAlgorithm; - - /* Set the input data set to the algorithm */ - localAlgorithm.input.set(pca::data, pData); - - /* Compute PCA decomposition */ - localAlgorithm.compute(); - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "PCA (native): local step took " << duration << " secs" << std::endl; +JNIEXPORT jlong JNICALL +Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL( + JNIEnv *env, jobject obj, jlong pNumTabData, jint k, jint executor_num, + jint executor_cores, jobject resultObj) { - t1 = std::chrono::high_resolution_clock::now(); + ccl::communicator &comm = getComm(); + size_t rankId = comm.rank(); - /* Serialize partial results required by step 2 */ - services::SharedPtr serializedData; - InputDataArchive dataArch; - localAlgorithm.getPartialResult()->serialize(dataArch); - size_t perNodeArchLength = dataArch.getSizeOfArchive(); + const size_t nBlocks = executor_num; + const int comm_size = executor_num; - serializedData = services::SharedPtr(new byte[perNodeArchLength * nBlocks]); + NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); + // Source data already normalized + pData->setNormalizationFlag(NumericTableIface::standardScoreNormalized); - byte* nodeResults = new byte[perNodeArchLength]; - dataArch.copyArchiveToArray(nodeResults, perNodeArchLength); + // Set number of threads for oneDAL to use for each rank + services::Environment::getInstance()->setNumberOfThreads(executor_cores); - t2 = std::chrono::high_resolution_clock::now(); + int nThreadsNew = + services::Environment::getInstance()->getNumberOfThreads(); + cout << "oneDAL (native): Number of CPU threads used: " << nThreadsNew + << endl; - duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "PCA (native): serializing partial results took " << duration << " secs" << std::endl; - - vector recv_counts(comm_size * perNodeArchLength); - for (int i = 0; i < comm_size; i++) recv_counts[i] = perNodeArchLength; - - cout << "PCA (native): ccl_allgatherv receiving " << perNodeArchLength * nBlocks << " bytes" << endl; - - t1 = 
std::chrono::high_resolution_clock::now(); - - /* Transfer partial results to step 2 on the root node */ - // MPI_Gather(nodeResults, perNodeArchLength, MPI_CHAR, serializedData.get(), - // perNodeArchLength, MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::allgatherv(nodeResults, perNodeArchLength, serializedData.get(), recv_counts, - ccl::datatype::uint8, comm).wait(); - - t2 = std::chrono::high_resolution_clock::now(); - - duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "PCA (native): ccl_allgatherv took " << duration << " secs" << std::endl; - - if (rankId == ccl_root) { auto t1 = std::chrono::high_resolution_clock::now(); - /* Create an algorithm for principal component analysis using the svdDense method - * on the master node */ - pca::Distributed masterAlgorithm; + pca::Distributed localAlgorithm; - for (size_t i = 0; i < nBlocks; i++) { - /* Deserialize partial results from step 1 */ - OutputDataArchive dataArch(serializedData.get() + perNodeArchLength * i, - perNodeArchLength); + /* Set the input data set to the algorithm */ + localAlgorithm.input.set(pca::data, pData); - services::SharedPtr > - dataForStep2FromStep1 = - services::SharedPtr >( - new pca::PartialResult()); - dataForStep2FromStep1->deserialize(dataArch); - - /* Set local partial results as input for the master-node algorithm */ - masterAlgorithm.input.add(pca::partialResults, dataForStep2FromStep1); - } - - /* Merge and finalizeCompute PCA decomposition on the master node */ - masterAlgorithm.compute(); - masterAlgorithm.finalizeCompute(); - - /* Retrieve the algorithm results */ - pca::ResultPtr result = masterAlgorithm.getResult(); + /* Compute PCA decomposition */ + localAlgorithm.compute(); auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "PCA (native): master step took " << duration << " secs" << std::endl; - - /* Print the results */ - printNumericTable(result->get(pca::eigenvalues), "First 10 eigenvalues with first 20 dimensions:", 10, 20); - printNumericTable(result->get(pca::eigenvectors), "First 10 eigenvectors with first 20 dimensions:", 10, 20); - - // Return all eigenvalues & eigenvectors - - // Get the class of the input object - jclass clazz = env->GetObjectClass(resultObj); - // Get Field references - jfieldID pcNumericTableField = env->GetFieldID(clazz, "pcNumericTable", "J"); - jfieldID explainedVarianceNumericTableField = env->GetFieldID(clazz, "explainedVarianceNumericTable", "J"); - - NumericTablePtr *eigenvalues = new NumericTablePtr(result->get(pca::eigenvalues)); - NumericTablePtr *eigenvectors = new NumericTablePtr(result->get(pca::eigenvectors)); - - env->SetLongField(resultObj, pcNumericTableField, (jlong)eigenvectors); - env->SetLongField(resultObj, explainedVarianceNumericTableField, (jlong)eigenvalues); - } + auto duration = + std::chrono::duration_cast(t2 - t1).count(); + std::cout << "PCA (native): local step took " << duration << " secs" + << std::endl; + + t1 = std::chrono::high_resolution_clock::now(); + + /* Serialize partial results required by step 2 */ + services::SharedPtr serializedData; + InputDataArchive dataArch; + localAlgorithm.getPartialResult()->serialize(dataArch); + size_t perNodeArchLength = dataArch.getSizeOfArchive(); + + serializedData = + services::SharedPtr(new byte[perNodeArchLength * nBlocks]); + + byte *nodeResults = new byte[perNodeArchLength]; + dataArch.copyArchiveToArray(nodeResults, perNodeArchLength); + + t2 = 
std::chrono::high_resolution_clock::now(); + + duration = + std::chrono::duration_cast(t2 - t1).count(); + std::cout << "PCA (native): serializing partial results took " << duration + << " secs" << std::endl; + + vector recv_counts(comm_size * perNodeArchLength); + for (int i = 0; i < comm_size; i++) + recv_counts[i] = perNodeArchLength; + + cout << "PCA (native): ccl_allgatherv receiving " + << perNodeArchLength * nBlocks << " bytes" << endl; + + t1 = std::chrono::high_resolution_clock::now(); + + /* Transfer partial results to step 2 on the root node */ + // MPI_Gather(nodeResults, perNodeArchLength, MPI_CHAR, + // serializedData.get(), perNodeArchLength, MPI_CHAR, ccl_root, + // MPI_COMM_WORLD); + ccl::allgatherv(nodeResults, perNodeArchLength, serializedData.get(), + recv_counts, ccl::datatype::uint8, comm) + .wait(); + + t2 = std::chrono::high_resolution_clock::now(); + + duration = + std::chrono::duration_cast(t2 - t1).count(); + std::cout << "PCA (native): ccl_allgatherv took " << duration << " secs" + << std::endl; + + if (rankId == ccl_root) { + auto t1 = std::chrono::high_resolution_clock::now(); + + /* Create an algorithm for principal component analysis using the + * svdDense method on the master node */ + pca::Distributed + masterAlgorithm; + + for (size_t i = 0; i < nBlocks; i++) { + /* Deserialize partial results from step 1 */ + OutputDataArchive dataArch(serializedData.get() + + perNodeArchLength * i, + perNodeArchLength); + + services::SharedPtr> + dataForStep2FromStep1 = + services::SharedPtr>( + new pca::PartialResult()); + dataForStep2FromStep1->deserialize(dataArch); + + /* Set local partial results as input for the master-node algorithm + */ + masterAlgorithm.input.add(pca::partialResults, + dataForStep2FromStep1); + } + + /* Merge and finalizeCompute PCA decomposition on the master node */ + masterAlgorithm.compute(); + masterAlgorithm.finalizeCompute(); + + /* Retrieve the algorithm results */ + pca::ResultPtr result = masterAlgorithm.getResult(); + + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(t2 - t1).count(); + std::cout << "PCA (native): master step took " << duration << " secs" + << std::endl; + + /* Print the results */ + printNumericTable(result->get(pca::eigenvalues), + "First 10 eigenvalues with first 20 dimensions:", 10, + 20); + printNumericTable(result->get(pca::eigenvectors), + "First 10 eigenvectors with first 20 dimensions:", 10, + 20); + + // Return all eigenvalues & eigenvectors + + // Get the class of the input object + jclass clazz = env->GetObjectClass(resultObj); + // Get Field references + jfieldID pcNumericTableField = + env->GetFieldID(clazz, "pcNumericTable", "J"); + jfieldID explainedVarianceNumericTableField = + env->GetFieldID(clazz, "explainedVarianceNumericTable", "J"); + + NumericTablePtr *eigenvalues = + new NumericTablePtr(result->get(pca::eigenvalues)); + NumericTablePtr *eigenvectors = + new NumericTablePtr(result->get(pca::eigenvectors)); + + env->SetLongField(resultObj, pcNumericTableField, (jlong)eigenvectors); + env->SetLongField(resultObj, explainedVarianceNumericTableField, + (jlong)eigenvalues); + } - return 0; + return 0; } diff --git a/mllib-dal/src/main/native/build-jni.sh b/mllib-dal/src/main/native/build-jni.sh index dacd8382b..3a07d62fc 100755 --- a/mllib-dal/src/main/native/build-jni.sh +++ b/mllib-dal/src/main/native/build-jni.sh @@ -1,5 +1,19 @@ #!/usr/bin/env bash +# Copyright 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + WORK_DIR="$( cd $( dirname "${BASH_SOURCE[0]}" ) && pwd )" DAAL_JAR=${ONEAPI_ROOT}/dal/latest/lib/onedal.jar diff --git a/mllib-dal/src/main/native/build.sh b/mllib-dal/src/main/native/build.sh index 763cf4bbe..d271c5d97 100755 --- a/mllib-dal/src/main/native/build.sh +++ b/mllib-dal/src/main/native/build.sh @@ -1,4 +1,18 @@ #!/usr/bin/env bash +# Copyright 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + make clean make -j diff --git a/mllib-dal/src/main/native/error_handling.cpp b/mllib-dal/src/main/native/error_handling.cpp index 2cb9a7270..ebd196901 100644 --- a/mllib-dal/src/main/native/error_handling.cpp +++ b/mllib-dal/src/main/native/error_handling.cpp @@ -1,19 +1,19 @@ /* file: error_handling.h */ /******************************************************************************* -* Copyright 2017-2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ + * Copyright 2017-2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ /* ! 
Content: @@ -27,38 +27,31 @@ const int fileError = -1001; -void checkAllocation(void * ptr) -{ - if (!ptr) - { +void checkAllocation(void *ptr) { + if (!ptr) { std::cout << "Error: Memory allocation failed" << std::endl; exit(-1); } } -void checkPtr(void * ptr) -{ - if (!ptr) - { +void checkPtr(void *ptr) { + if (!ptr) { std::cout << "Error: NULL pointer" << std::endl; exit(-2); } } -void fileOpenError(const char * filename) -{ +void fileOpenError(const char *filename) { std::cout << "Unable to open file '" << filename << "'" << std::endl; exit(fileError); } -void fileReadError() -{ +void fileReadError() { std::cout << "Unable to read next line" << std::endl; exit(fileError); } -void sparceFileReadError() -{ +void sparceFileReadError() { std::cout << "Incorrect format of file" << std::endl; exit(fileError); } diff --git a/mllib-dal/src/main/native/error_handling.h b/mllib-dal/src/main/native/error_handling.h index c157a1ada..7852cab24 100644 --- a/mllib-dal/src/main/native/error_handling.h +++ b/mllib-dal/src/main/native/error_handling.h @@ -1,19 +1,19 @@ /* file: error_handling.h */ /******************************************************************************* -* Copyright 2017-2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ + * Copyright 2017-2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ /* ! Content: @@ -25,9 +25,9 @@ const int fileError = -1001; -void checkAllocation(void * ptr); -void checkPtr(void * ptr); -void fileOpenError(const char * filename); +void checkAllocation(void *ptr); +void checkPtr(void *ptr); +void fileOpenError(const char *filename); void fileReadError(); void sparceFileReadError(); diff --git a/mllib-dal/src/main/native/service.cpp b/mllib-dal/src/main/native/service.cpp index 623767406..7cb26c385 100644 --- a/mllib-dal/src/main/native/service.cpp +++ b/mllib-dal/src/main/native/service.cpp @@ -1,11 +1,32 @@ +/* file: service.cpp */ +/******************************************************************************* + * Copyright 2017-2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + +/* +! Content: +! Auxiliary functions used in C++ samples +!******************************************************************************/ + #include "service.h" #include "error_handling.h" -size_t readTextFile(const std::string & datasetFileName, daal::byte ** data) -{ - std::ifstream file(datasetFileName.c_str(), std::ios::binary | std::ios::ate); - if (!file.is_open()) - { +size_t readTextFile(const std::string &datasetFileName, daal::byte **data) { + std::ifstream file(datasetFileName.c_str(), + std::ios::binary | std::ios::ate); + if (!file.is_open()) { fileOpenError(datasetFileName.c_str()); } @@ -17,8 +38,7 @@ size_t readTextFile(const std::string & datasetFileName, daal::byte ** data) (*data) = new daal::byte[fileSize]; checkAllocation(data); - if (!file.read((char *)(*data), fileSize)) - { + if (!file.read((char *)(*data), fileSize)) { delete[] data; fileReadError(); } @@ -27,12 +47,11 @@ size_t readTextFile(const std::string & datasetFileName, daal::byte ** data) } template -void readLine(std::string & line, size_t nCols, item_type * data, size_t firstPos = 0) -{ +void readLine(std::string &line, size_t nCols, item_type *data, + size_t firstPos = 0) { std::stringstream iss(line); - for (size_t col = 0; col < nCols; ++col) - { + for (size_t col = 0; col < nCols; ++col) { std::string val; std::getline(iss, val, ','); @@ -42,138 +61,141 @@ void readLine(std::string & line, size_t nCols, item_type * data, size_t firstPo } template -void readRowUnknownLength(char * line, std::vector & data) -{ - size_t n = 0; - const char * prevDelim = line - 1; - char * ptr = line; - for (; *ptr; ++ptr) - { - if (*ptr == ',' || *ptr == '\r') - { - if (prevDelim != ptr - 1) ++n; - *ptr = ' '; +void readRowUnknownLength(char *line, std::vector &data) { + size_t n = 0; + const char *prevDelim = line - 1; + char *ptr = line; + for (; *ptr; ++ptr) { + if (*ptr == ',' || *ptr == '\r') { + if (prevDelim != ptr - 1) + ++n; + *ptr = ' '; prevDelim = ptr; } } - if (prevDelim != ptr - 1) ++n; + if (prevDelim != ptr - 1) + ++n; data.resize(n); std::stringstream iss(line); - for (size_t i = 0; i < n; ++i) - { + for (size_t i = 0; i < n; ++i) { iss >> data[i]; } } template -CSRNumericTable * createSparseTable(const std::string & datasetFileName) -{ +CSRNumericTable *createSparseTable(const std::string &datasetFileName) { std::ifstream file(datasetFileName.c_str()); - if (!file.is_open()) - { + if (!file.is_open()) { fileOpenError(datasetFileName.c_str()); } std::string str; - //read row offsets + // read row offsets std::getline(file, str); std::vector rowOffsets; readRowUnknownLength(&str[0], rowOffsets); - if (!rowOffsets.size()) return NULL; + if (!rowOffsets.size()) + return NULL; const size_t nVectors = rowOffsets.size() - 1; - //read cols indices + // read cols indices std::getline(file, str); std::vector colIndices; readRowUnknownLength(&str[0], colIndices); - //read values + // read values std::getline(file, str); std::vector data; readRowUnknownLength(&str[0], data); const size_t nNonZeros = 
data.size(); size_t maxCol = 0; - for (size_t i = 0; i < colIndices.size(); ++i) - { - if (colIndices[i] > maxCol) maxCol = colIndices[i]; + for (size_t i = 0; i < colIndices.size(); ++i) { + if (colIndices[i] > maxCol) + maxCol = colIndices[i]; } const size_t nFeatures = maxCol; - if (!nFeatures || !nVectors || colIndices.size() != nNonZeros || nNonZeros != (rowOffsets[nVectors] - 1)) - { + if (!nFeatures || !nVectors || colIndices.size() != nNonZeros || + nNonZeros != (rowOffsets[nVectors] - 1)) { sparceFileReadError(); } - size_t * resultRowOffsets = NULL; - size_t * resultColIndices = NULL; - item_type * resultData = NULL; - CSRNumericTable * numericTable = new CSRNumericTable(resultData, resultColIndices, resultRowOffsets, nFeatures, nVectors); + size_t *resultRowOffsets = NULL; + size_t *resultColIndices = NULL; + item_type *resultData = NULL; + CSRNumericTable *numericTable = new CSRNumericTable( + resultData, resultColIndices, resultRowOffsets, nFeatures, nVectors); numericTable->allocateDataMemory(nNonZeros); - numericTable->getArrays(&resultData, &resultColIndices, &resultRowOffsets); - for (size_t i = 0; i < nNonZeros; ++i) - { - resultData[i] = data[i]; + numericTable->getArrays(&resultData, &resultColIndices, + &resultRowOffsets); + for (size_t i = 0; i < nNonZeros; ++i) { + resultData[i] = data[i]; resultColIndices[i] = colIndices[i]; } - for (size_t i = 0; i < nVectors + 1; ++i) - { + for (size_t i = 0; i < nVectors + 1; ++i) { resultRowOffsets[i] = rowOffsets[i]; } return numericTable; } -CSRNumericTable * createFloatSparseTable(const std::string & datasetFileName) { +CSRNumericTable *createFloatSparseTable(const std::string &datasetFileName) { return createSparseTable(datasetFileName); } -void printAprioriItemsets(NumericTablePtr largeItemsetsTable, NumericTablePtr largeItemsetsSupportTable, size_t nItemsetToPrint = 20) -{ - size_t largeItemsetCount = largeItemsetsSupportTable->getNumberOfRows(); +void printAprioriItemsets(NumericTablePtr largeItemsetsTable, + NumericTablePtr largeItemsetsSupportTable, + size_t nItemsetToPrint = 20) { + size_t largeItemsetCount = largeItemsetsSupportTable->getNumberOfRows(); size_t nItemsInLargeItemsets = largeItemsetsTable->getNumberOfRows(); BlockDescriptor block1; - largeItemsetsTable->getBlockOfRows(0, nItemsInLargeItemsets, readOnly, block1); - int * largeItemsets = block1.getBlockPtr(); + largeItemsetsTable->getBlockOfRows(0, nItemsInLargeItemsets, readOnly, + block1); + int *largeItemsets = block1.getBlockPtr(); BlockDescriptor block2; - largeItemsetsSupportTable->getBlockOfRows(0, largeItemsetCount, readOnly, block2); - int * largeItemsetsSupportData = block2.getBlockPtr(); + largeItemsetsSupportTable->getBlockOfRows(0, largeItemsetCount, readOnly, + block2); + int *largeItemsetsSupportData = block2.getBlockPtr(); - std::vector > largeItemsetsVector; + std::vector> largeItemsetsVector; largeItemsetsVector.resize(largeItemsetCount); - for (size_t i = 0; i < nItemsInLargeItemsets; i++) - { - largeItemsetsVector[largeItemsets[2 * i]].push_back(largeItemsets[2 * i + 1]); + for (size_t i = 0; i < nItemsInLargeItemsets; i++) { + largeItemsetsVector[largeItemsets[2 * i]].push_back( + largeItemsets[2 * i + 1]); } std::vector supportVector; supportVector.resize(largeItemsetCount); - for (size_t i = 0; i < largeItemsetCount; i++) - { - supportVector[largeItemsetsSupportData[2 * i]] = largeItemsetsSupportData[2 * i + 1]; + for (size_t i = 0; i < largeItemsetCount; i++) { + supportVector[largeItemsetsSupportData[2 * i]] = + 
largeItemsetsSupportData[2 * i + 1]; } std::cout << std::endl << "Apriori example program results" << std::endl; - std::cout << std::endl << "Last " << nItemsetToPrint << " large itemsets: " << std::endl; + std::cout << std::endl + << "Last " << nItemsetToPrint << " large itemsets: " << std::endl; std::cout << std::endl << "Itemset" << "\t\t\tSupport" << std::endl; - size_t iMin = (((largeItemsetCount > nItemsetToPrint) && (nItemsetToPrint != 0)) ? largeItemsetCount - nItemsetToPrint : 0); - for (size_t i = iMin; i < largeItemsetCount; i++) - { + size_t iMin = + (((largeItemsetCount > nItemsetToPrint) && (nItemsetToPrint != 0)) + ? largeItemsetCount - nItemsetToPrint + : 0); + for (size_t i = iMin; i < largeItemsetCount; i++) { std::cout << "{"; - for (size_t l = 0; l < largeItemsetsVector[i].size() - 1; l++) - { + for (size_t l = 0; l < largeItemsetsVector[i].size() - 1; l++) { std::cout << largeItemsetsVector[i][l] << ", "; } - std::cout << largeItemsetsVector[i][largeItemsetsVector[i].size() - 1] << "}\t\t"; + std::cout << largeItemsetsVector[i][largeItemsetsVector[i].size() - 1] + << "}\t\t"; std::cout << supportVector[i] << std::endl; } @@ -182,74 +204,76 @@ void printAprioriItemsets(NumericTablePtr largeItemsetsTable, NumericTablePtr la largeItemsetsSupportTable->releaseBlockOfRows(block2); } -void printAprioriRules(NumericTablePtr leftItemsTable, NumericTablePtr rightItemsTable, NumericTablePtr confidenceTable, size_t nRulesToPrint = 20) -{ - size_t nRules = confidenceTable->getNumberOfRows(); - size_t nLeftItems = leftItemsTable->getNumberOfRows(); +void printAprioriRules(NumericTablePtr leftItemsTable, + NumericTablePtr rightItemsTable, + NumericTablePtr confidenceTable, + size_t nRulesToPrint = 20) { + size_t nRules = confidenceTable->getNumberOfRows(); + size_t nLeftItems = leftItemsTable->getNumberOfRows(); size_t nRightItems = rightItemsTable->getNumberOfRows(); BlockDescriptor block1; leftItemsTable->getBlockOfRows(0, nLeftItems, readOnly, block1); - int * leftItems = block1.getBlockPtr(); + int *leftItems = block1.getBlockPtr(); BlockDescriptor block2; rightItemsTable->getBlockOfRows(0, nRightItems, readOnly, block2); - int * rightItems = block2.getBlockPtr(); + int *rightItems = block2.getBlockPtr(); BlockDescriptor block3; confidenceTable->getBlockOfRows(0, nRules, readOnly, block3); - DAAL_DATA_TYPE * confidence = block3.getBlockPtr(); + DAAL_DATA_TYPE *confidence = block3.getBlockPtr(); - std::vector > leftItemsVector; + std::vector> leftItemsVector; leftItemsVector.resize(nRules); - if (nRules == 0) - { - std::cout << std::endl << "No association rules were found " << std::endl; + if (nRules == 0) { + std::cout << std::endl + << "No association rules were found " << std::endl; return; } - for (size_t i = 0; i < nLeftItems; i++) - { + for (size_t i = 0; i < nLeftItems; i++) { leftItemsVector[leftItems[2 * i]].push_back(leftItems[2 * i + 1]); } - std::vector > rightItemsVector; + std::vector> rightItemsVector; rightItemsVector.resize(nRules); - for (size_t i = 0; i < nRightItems; i++) - { + for (size_t i = 0; i < nRightItems; i++) { rightItemsVector[rightItems[2 * i]].push_back(rightItems[2 * i + 1]); } std::vector confidenceVector; confidenceVector.resize(nRules); - for (size_t i = 0; i < nRules; i++) - { + for (size_t i = 0; i < nRules; i++) { confidenceVector[i] = confidence[i]; } - std::cout << std::endl << "Last " << nRulesToPrint << " association rules: " << std::endl; + std::cout << std::endl + << "Last " << nRulesToPrint + << " association rules: " << std::endl; 
std::cout << std::endl << "Rule" << "\t\t\t\tConfidence" << std::endl; - size_t iMin = (((nRules > nRulesToPrint) && (nRulesToPrint != 0)) ? (nRules - nRulesToPrint) : 0); + size_t iMin = (((nRules > nRulesToPrint) && (nRulesToPrint != 0)) + ? (nRules - nRulesToPrint) + : 0); - for (size_t i = iMin; i < nRules; i++) - { + for (size_t i = iMin; i < nRules; i++) { std::cout << "{"; - for (size_t l = 0; l < leftItemsVector[i].size() - 1; l++) - { + for (size_t l = 0; l < leftItemsVector[i].size() - 1; l++) { std::cout << leftItemsVector[i][l] << ", "; } - std::cout << leftItemsVector[i][leftItemsVector[i].size() - 1] << "} => {"; + std::cout << leftItemsVector[i][leftItemsVector[i].size() - 1] + << "} => {"; - for (size_t l = 0; l < rightItemsVector[i].size() - 1; l++) - { + for (size_t l = 0; l < rightItemsVector[i].size() - 1; l++) { std::cout << rightItemsVector[i][l] << ", "; } - std::cout << rightItemsVector[i][rightItemsVector[i].size() - 1] << "}\t\t"; + std::cout << rightItemsVector[i][rightItemsVector[i].size() - 1] + << "}\t\t"; std::cout << confidenceVector[i] << std::endl; } @@ -259,44 +283,41 @@ void printAprioriRules(NumericTablePtr leftItemsTable, NumericTablePtr rightItem confidenceTable->releaseBlockOfRows(block3); } -bool isFull(NumericTableIface::StorageLayout layout) -{ +bool isFull(NumericTableIface::StorageLayout layout) { int layoutInt = (int)layout; - if (packed_mask & layoutInt) - { + if (packed_mask & layoutInt) { return false; } return true; } -bool isUpper(NumericTableIface::StorageLayout layout) -{ - if (layout == NumericTableIface::upperPackedSymmetricMatrix || layout == NumericTableIface::upperPackedTriangularMatrix) - { +bool isUpper(NumericTableIface::StorageLayout layout) { + if (layout == NumericTableIface::upperPackedSymmetricMatrix || + layout == NumericTableIface::upperPackedTriangularMatrix) { return true; } return false; } -bool isLower(NumericTableIface::StorageLayout layout) -{ - if (layout == NumericTableIface::lowerPackedSymmetricMatrix || layout == NumericTableIface::lowerPackedTriangularMatrix) - { +bool isLower(NumericTableIface::StorageLayout layout) { + if (layout == NumericTableIface::lowerPackedSymmetricMatrix || + layout == NumericTableIface::lowerPackedTriangularMatrix) { return true; } return false; } template -void printArray(T * array, const size_t nPrintedCols, const size_t nPrintedRows, const size_t nCols, const std::string& message, size_t interval = 10) -{ +void printArray(T *array, const size_t nPrintedCols, const size_t nPrintedRows, + const size_t nCols, const std::string &message, + size_t interval = 10) { std::cout << std::setiosflags(std::ios::left); std::cout << message << std::endl; - for (size_t i = 0; i < nPrintedRows; i++) - { - for (size_t j = 0; j < nPrintedCols; j++) - { - std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed) << std::setprecision(3); + for (size_t i = 0; i < nPrintedRows; i++) { + for (size_t j = 0; j < nPrintedCols; j++) { + std::cout << std::setw(interval) + << std::setiosflags(std::ios::fixed) + << std::setprecision(3); std::cout << array[i * nCols + j]; } std::cout << std::endl; @@ -305,22 +326,22 @@ void printArray(T * array, const size_t nPrintedCols, const size_t nPrintedRows, } template -void printArray(T * array, const size_t nCols, const size_t nRows, const std::string& message, size_t interval = 10) -{ +void printArray(T *array, const size_t nCols, const size_t nRows, + const std::string &message, size_t interval = 10) { printArray(array, nCols, nRows, nCols, message, 
interval); } template -void printLowerArray(T * array, const size_t nPrintedRows, const std::string& message, size_t interval = 10) -{ +void printLowerArray(T *array, const size_t nPrintedRows, + const std::string &message, size_t interval = 10) { std::cout << std::setiosflags(std::ios::left); std::cout << message << std::endl; int ind = 0; - for (size_t i = 0; i < nPrintedRows; i++) - { - for (size_t j = 0; j <= i; j++) - { - std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed) << std::setprecision(3); + for (size_t i = 0; i < nPrintedRows; i++) { + for (size_t j = 0; j <= i; j++) { + std::cout << std::setw(interval) + << std::setiosflags(std::ios::fixed) + << std::setprecision(3); std::cout << array[ind++]; } std::cout << std::endl; @@ -329,24 +350,23 @@ void printLowerArray(T * array, const size_t nPrintedRows, const std::string& me } template -void printUpperArray(T * array, const size_t nPrintedCols, const size_t nPrintedRows, const size_t nCols, const std::string& message, size_t interval = 10) -{ +void printUpperArray(T *array, const size_t nPrintedCols, + const size_t nPrintedRows, const size_t nCols, + const std::string &message, size_t interval = 10) { std::cout << std::setiosflags(std::ios::left); std::cout << message << std::endl; int ind = 0; - for (size_t i = 0; i < nPrintedRows; i++) - { - for (size_t j = 0; j < i; j++) - { + for (size_t i = 0; i < nPrintedRows; i++) { + for (size_t j = 0; j < i; j++) { std::cout << " "; } - for (size_t j = i; j < nPrintedCols; j++) - { - std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed) << std::setprecision(3); + for (size_t j = i; j < nPrintedCols; j++) { + std::cout << std::setw(interval) + << std::setiosflags(std::ios::fixed) + << std::setprecision(3); std::cout << array[ind++]; } - for (size_t j = nPrintedCols; j < nCols; j++) - { + for (size_t j = nPrintedCols; j < nCols; j++) { ind++; } std::cout << std::endl; @@ -354,80 +374,77 @@ void printUpperArray(T * array, const size_t nPrintedCols, const size_t nPrinted std::cout << std::endl; } -void printNumericTable(NumericTable * dataTable, const char * message = "", size_t nPrintedRows = 0, size_t nPrintedCols = 0, size_t interval = 10) -{ - size_t nRows = dataTable->getNumberOfRows(); - size_t nCols = dataTable->getNumberOfColumns(); +void printNumericTable(NumericTable *dataTable, const char *message = "", + size_t nPrintedRows = 0, size_t nPrintedCols = 0, + size_t interval = 10) { + size_t nRows = dataTable->getNumberOfRows(); + size_t nCols = dataTable->getNumberOfColumns(); NumericTableIface::StorageLayout layout = dataTable->getDataLayout(); - if (nPrintedRows != 0) - { + if (nPrintedRows != 0) { nPrintedRows = std::min(nRows, nPrintedRows); - } - else - { + } else { nPrintedRows = nRows; } - if (nPrintedCols != 0) - { + if (nPrintedCols != 0) { nPrintedCols = std::min(nCols, nPrintedCols); - } - else - { + } else { nPrintedCols = nCols; } BlockDescriptor block; - if (isFull(layout) || layout == NumericTableIface::csrArray) - { + if (isFull(layout) || layout == NumericTableIface::csrArray) { dataTable->getBlockOfRows(0, nRows, readOnly, block); - printArray(block.getBlockPtr(), nPrintedCols, nPrintedRows, nCols, message, interval); + printArray(block.getBlockPtr(), nPrintedCols, + nPrintedRows, nCols, message, interval); dataTable->releaseBlockOfRows(block); - } - else - { - PackedArrayNumericTableIface * packedTable = dynamic_cast(dataTable); + } else { + PackedArrayNumericTableIface *packedTable = + dynamic_cast(dataTable); 
packedTable->getPackedArray(readOnly, block); - if (isLower(layout)) - { - printLowerArray(block.getBlockPtr(), nPrintedRows, message, interval); - } - else if (isUpper(layout)) - { - printUpperArray(block.getBlockPtr(), nPrintedCols, nPrintedRows, nCols, message, interval); + if (isLower(layout)) { + printLowerArray(block.getBlockPtr(), nPrintedRows, + message, interval); + } else if (isUpper(layout)) { + printUpperArray(block.getBlockPtr(), nPrintedCols, + nPrintedRows, nCols, message, + interval); } packedTable->releasePackedArray(block); } } -void printNumericTable(NumericTable & dataTable, const char * message = "", size_t nPrintedRows = 0, size_t nPrintedCols = 0, size_t interval = 10) -{ - printNumericTable(&dataTable, message, nPrintedRows, nPrintedCols, interval); +void printNumericTable(NumericTable &dataTable, const char *message = "", + size_t nPrintedRows = 0, size_t nPrintedCols = 0, + size_t interval = 10) { + printNumericTable(&dataTable, message, nPrintedRows, nPrintedCols, + interval); } -void printNumericTable(const NumericTablePtr & dataTable, const char * message, size_t nPrintedRows, size_t nPrintedCols, - size_t interval) -{ - printNumericTable(dataTable.get(), message, nPrintedRows, nPrintedCols, interval); +void printNumericTable(const NumericTablePtr &dataTable, const char *message, + size_t nPrintedRows, size_t nPrintedCols, + size_t interval) { + printNumericTable(dataTable.get(), message, nPrintedRows, nPrintedCols, + interval); } -void printPackedNumericTable(NumericTable * dataTable, size_t nFeatures, const char * message = "", size_t interval = 10) -{ +void printPackedNumericTable(NumericTable *dataTable, size_t nFeatures, + const char *message = "", size_t interval = 10) { BlockDescriptor block; dataTable->getBlockOfRows(0, 1, readOnly, block); - DAAL_DATA_TYPE * data = block.getBlockPtr(); + DAAL_DATA_TYPE *data = block.getBlockPtr(); std::cout << std::setiosflags(std::ios::left); std::cout << message << std::endl; size_t index = 0; - for (size_t i = 0; i < nFeatures; i++) - { - for (size_t j = 0; j <= i; j++, index++) - { - std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed) << std::setprecision(3); + for (size_t i = 0; i < nFeatures; i++) { + for (size_t j = 0; j <= i; j++, index++) { + std::cout << std::setw(interval) + << std::setiosflags(std::ios::fixed) + << std::setprecision(3); std::cout << data[index]; } std::cout << std::endl; @@ -437,15 +454,16 @@ void printPackedNumericTable(NumericTable * dataTable, size_t nFeatures, const c dataTable->releaseBlockOfRows(block); } -void printPackedNumericTable(NumericTable & dataTable, size_t nFeatures, const char * message = "", size_t interval = 10) -{ +void printPackedNumericTable(NumericTable &dataTable, size_t nFeatures, + const char *message = "", size_t interval = 10) { printPackedNumericTable(&dataTable, nFeatures, message); } template -void printNumericTables(NumericTable * dataTable1, NumericTable * dataTable2, const char * title1 = "", const char * title2 = "", - const char * message = "", size_t nPrintedRows = 0, size_t interval = 15) -{ +void printNumericTables(NumericTable *dataTable1, NumericTable *dataTable2, + const char *title1 = "", const char *title2 = "", + const char *message = "", size_t nPrintedRows = 0, + size_t interval = 15) { size_t nRows1 = dataTable1->getNumberOfRows(); size_t nRows2 = dataTable2->getNumberOfRows(); size_t nCols1 = dataTable1->getNumberOfColumns(); @@ -455,31 +473,30 @@ void printNumericTables(NumericTable * dataTable1, NumericTable * dataTable2, 
co BlockDescriptor block2; size_t nRows = std::min(nRows1, nRows2); - if (nPrintedRows != 0) - { + if (nPrintedRows != 0) { nRows = std::min(std::min(nRows1, nRows2), nPrintedRows); } dataTable1->getBlockOfRows(0, nRows, readOnly, block1); dataTable2->getBlockOfRows(0, nRows, readOnly, block2); - type1 * data1 = block1.getBlockPtr(); - type2 * data2 = block2.getBlockPtr(); + type1 *data1 = block1.getBlockPtr(); + type2 *data2 = block2.getBlockPtr(); std::cout << std::setiosflags(std::ios::left); std::cout << message << std::endl; std::cout << std::setw(interval * nCols1) << title1; std::cout << std::setw(interval * nCols2) << title2 << std::endl; - for (size_t i = 0; i < nRows; i++) - { - for (size_t j = 0; j < nCols1; j++) - { - std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed) << std::setprecision(3); + for (size_t i = 0; i < nRows; i++) { + for (size_t j = 0; j < nCols1; j++) { + std::cout << std::setw(interval) + << std::setiosflags(std::ios::fixed) + << std::setprecision(3); std::cout << data1[i * nCols1 + j]; } - for (size_t j = 0; j < nCols2; j++) - { - std::cout << std::setprecision(0) << std::setw(interval) << data2[i * nCols2 + j]; + for (size_t j = 0; j < nCols2; j++) { + std::cout << std::setprecision(0) << std::setw(interval) + << data2[i * nCols2 + j]; } std::cout << std::endl; } @@ -490,15 +507,18 @@ void printNumericTables(NumericTable * dataTable1, NumericTable * dataTable2, co } template -void printNumericTables(NumericTable * dataTable1, NumericTable & dataTable2, const char * title1 = "", const char * title2 = "", - const char * message = "", size_t nPrintedRows = 0, size_t interval = 10) -{ - printNumericTables(dataTable1, &dataTable2, title1, title2, message, nPrintedRows, interval); -} - -void printNumericTables(NumericTable * dataTable1, NumericTable * dataTable2, const char * title1 = "", const char * title2 = "", - const char * message = "", size_t nPrintedRows = 0, size_t interval = 10) -{ +void printNumericTables(NumericTable *dataTable1, NumericTable &dataTable2, + const char *title1 = "", const char *title2 = "", + const char *message = "", size_t nPrintedRows = 0, + size_t interval = 10) { + printNumericTables(dataTable1, &dataTable2, title1, title2, + message, nPrintedRows, interval); +} + +void printNumericTables(NumericTable *dataTable1, NumericTable *dataTable2, + const char *title1 = "", const char *title2 = "", + const char *message = "", size_t nPrintedRows = 0, + size_t interval = 10) { size_t nRows1 = dataTable1->getNumberOfRows(); size_t nRows2 = dataTable2->getNumberOfRows(); size_t nCols1 = dataTable1->getNumberOfColumns(); @@ -508,31 +528,30 @@ void printNumericTables(NumericTable * dataTable1, NumericTable * dataTable2, co BlockDescriptor block2; size_t nRows = std::min(nRows1, nRows2); - if (nPrintedRows != 0) - { + if (nPrintedRows != 0) { nRows = std::min(std::min(nRows1, nRows2), nPrintedRows); } dataTable1->getBlockOfRows(0, nRows, readOnly, block1); dataTable2->getBlockOfRows(0, nRows, readOnly, block2); - DAAL_DATA_TYPE * data1 = block1.getBlockPtr(); - DAAL_DATA_TYPE * data2 = block2.getBlockPtr(); + DAAL_DATA_TYPE *data1 = block1.getBlockPtr(); + DAAL_DATA_TYPE *data2 = block2.getBlockPtr(); std::cout << std::setiosflags(std::ios::left); std::cout << message << std::endl; std::cout << std::setw(interval * nCols1) << title1; std::cout << std::setw(interval * nCols2) << title2 << std::endl; - for (size_t i = 0; i < nRows; i++) - { - for (size_t j = 0; j < nCols1; j++) - { - std::cout << std::setw(interval) << 
std::setiosflags(std::ios::fixed) << std::setprecision(3); + for (size_t i = 0; i < nRows; i++) { + for (size_t j = 0; j < nCols1; j++) { + std::cout << std::setw(interval) + << std::setiosflags(std::ios::fixed) + << std::setprecision(3); std::cout << data1[i * nCols1 + j]; } - for (size_t j = 0; j < nCols2; j++) - { - std::cout << std::setprecision(0) << std::setw(interval) << data2[i * nCols2 + j]; + for (size_t j = 0; j < nCols2; j++) { + std::cout << std::setprecision(0) << std::setw(interval) + << data2[i * nCols2 + j]; } std::cout << std::endl; } @@ -542,112 +561,91 @@ void printNumericTables(NumericTable * dataTable1, NumericTable * dataTable2, co dataTable2->releaseBlockOfRows(block2); } -void printNumericTables(NumericTable * dataTable1, NumericTable & dataTable2, const char * title1 = "", const char * title2 = "", - const char * message = "", size_t nPrintedRows = 0, size_t interval = 10) -{ - printNumericTables(dataTable1, &dataTable2, title1, title2, message, nPrintedRows, interval); +void printNumericTables(NumericTable *dataTable1, NumericTable &dataTable2, + const char *title1 = "", const char *title2 = "", + const char *message = "", size_t nPrintedRows = 0, + size_t interval = 10) { + printNumericTables(dataTable1, &dataTable2, title1, title2, message, + nPrintedRows, interval); } template -void printNumericTables(NumericTablePtr dataTable1, NumericTablePtr dataTable2, const char * title1 = "", const char * title2 = "", - const char * message = "", size_t nPrintedRows = 0, size_t interval = 10) -{ - printNumericTables(dataTable1.get(), dataTable2.get(), title1, title2, message, nPrintedRows, interval); +void printNumericTables(NumericTablePtr dataTable1, NumericTablePtr dataTable2, + const char *title1 = "", const char *title2 = "", + const char *message = "", size_t nPrintedRows = 0, + size_t interval = 10) { + printNumericTables(dataTable1.get(), dataTable2.get(), title1, + title2, message, nPrintedRows, interval); } -bool checkFileIsAvailable(std::string filename, bool needExit = false) -{ +bool checkFileIsAvailable(std::string filename, bool needExit = false) { std::ifstream file(filename.c_str()); - if (file.good()) - { + if (file.good()) { return true; - } - else - { + } else { std::cout << "Can't open file " << filename << std::endl; - if (needExit) - { + if (needExit) { exit(fileError); } return false; } } -void checkArguments(int argc, char * argv[], int count, ...) -{ - std::string ** filelist = new std::string *[count]; +void checkArguments(int argc, char *argv[], int count, ...) 
{ + std::string **filelist = new std::string *[count]; va_list ap; va_start(ap, count); - for (int i = 0; i < count; i++) - { + for (int i = 0; i < count; i++) { filelist[i] = va_arg(ap, std::string *); } va_end(ap); - if (argc == 1) - { - for (int i = 0; i < count; i++) - { + if (argc == 1) { + for (int i = 0; i < count; i++) { checkFileIsAvailable(*(filelist[i]), true); } - } - else if (argc == (count + 1)) - { + } else if (argc == (count + 1)) { bool isAllCorrect = true; - for (int i = 0; i < count; i++) - { - if (!checkFileIsAvailable(argv[i + 1])) - { + for (int i = 0; i < count; i++) { + if (!checkFileIsAvailable(argv[i + 1])) { isAllCorrect = false; break; } } - if (isAllCorrect == true) - { - for (int i = 0; i < count; i++) - { + if (isAllCorrect == true) { + for (int i = 0; i < count; i++) { (*filelist[i]) = argv[i + 1]; } - } - else - { - std::cout << "Warning: Try to open default datasetFileNames" << std::endl; - for (int i = 0; i < count; i++) - { + } else { + std::cout << "Warning: Try to open default datasetFileNames" + << std::endl; + for (int i = 0; i < count; i++) { checkFileIsAvailable(*(filelist[i]), true); } } - } - else - { + } else { std::cout << "Usage: " << argv[0] << " [ "; - for (int i = 0; i < count; i++) - { + for (int i = 0; i < count; i++) { std::cout << " "; } std::cout << "]" << std::endl; - std::cout << "Warning: Try to open default datasetFileNames" << std::endl; - for (int i = 0; i < count; i++) - { + std::cout << "Warning: Try to open default datasetFileNames" + << std::endl; + for (int i = 0; i < count; i++) { checkFileIsAvailable(*(filelist[i]), true); } } delete[] filelist; } -void copyBytes(daal::byte * dst, daal::byte * src, size_t size) -{ - for (size_t i = 0; i < size; i++) - { +void copyBytes(daal::byte *dst, daal::byte *src, size_t size) { + for (size_t i = 0; i < size; i++) { dst[i] = src[i]; } } -size_t checkBytes(daal::byte * dst, daal::byte * src, size_t size) -{ - for (size_t i = 0; i < size; i++) - { - if (dst[i] != src[i]) - { +size_t checkBytes(daal::byte *dst, daal::byte *src, size_t size) { + for (size_t i = 0; i < size; i++) { + if (dst[i] != src[i]) { return i + 1; } } @@ -655,34 +653,53 @@ size_t checkBytes(daal::byte * dst, daal::byte * src, size_t size) } static const unsigned int crcRem[] = { - 0x00000000, 0x741B8CD6, 0xE83719AC, 0x9C2C957A, 0xA475BF8E, 0xD06E3358, 0x4C42A622, 0x38592AF4, 0x3CF0F3CA, 0x48EB7F1C, 0xD4C7EA66, 0xA0DC66B0, - 0x98854C44, 0xEC9EC092, 0x70B255E8, 0x04A9D93E, 0x79E1E794, 0x0DFA6B42, 0x91D6FE38, 0xE5CD72EE, 0xDD94581A, 0xA98FD4CC, 0x35A341B6, 0x41B8CD60, - 0x4511145E, 0x310A9888, 0xAD260DF2, 0xD93D8124, 0xE164ABD0, 0x957F2706, 0x0953B27C, 0x7D483EAA, 0xF3C3CF28, 0x87D843FE, 0x1BF4D684, 0x6FEF5A52, - 0x57B670A6, 0x23ADFC70, 0xBF81690A, 0xCB9AE5DC, 0xCF333CE2, 0xBB28B034, 0x2704254E, 0x531FA998, 0x6B46836C, 0x1F5D0FBA, 0x83719AC0, 0xF76A1616, - 0x8A2228BC, 0xFE39A46A, 0x62153110, 0x160EBDC6, 0x2E579732, 0x5A4C1BE4, 0xC6608E9E, 0xB27B0248, 0xB6D2DB76, 0xC2C957A0, 0x5EE5C2DA, 0x2AFE4E0C, - 0x12A764F8, 0x66BCE82E, 0xFA907D54, 0x8E8BF182, 0x939C1286, 0xE7879E50, 0x7BAB0B2A, 0x0FB087FC, 0x37E9AD08, 0x43F221DE, 0xDFDEB4A4, 0xABC53872, - 0xAF6CE14C, 0xDB776D9A, 0x475BF8E0, 0x33407436, 0x0B195EC2, 0x7F02D214, 0xE32E476E, 0x9735CBB8, 0xEA7DF512, 0x9E6679C4, 0x024AECBE, 0x76516068, - 0x4E084A9C, 0x3A13C64A, 0xA63F5330, 0xD224DFE6, 0xD68D06D8, 0xA2968A0E, 0x3EBA1F74, 0x4AA193A2, 0x72F8B956, 0x06E33580, 0x9ACFA0FA, 0xEED42C2C, - 0x605FDDAE, 0x14445178, 0x8868C402, 0xFC7348D4, 0xC42A6220, 0xB031EEF6, 0x2C1D7B8C, 
0x5806F75A, 0x5CAF2E64, 0x28B4A2B2, 0xB49837C8, 0xC083BB1E, - 0xF8DA91EA, 0x8CC11D3C, 0x10ED8846, 0x64F60490, 0x19BE3A3A, 0x6DA5B6EC, 0xF1892396, 0x8592AF40, 0xBDCB85B4, 0xC9D00962, 0x55FC9C18, 0x21E710CE, - 0x254EC9F0, 0x51554526, 0xCD79D05C, 0xB9625C8A, 0x813B767E, 0xF520FAA8, 0x690C6FD2, 0x1D17E304, 0x5323A9DA, 0x2738250C, 0xBB14B076, 0xCF0F3CA0, - 0xF7561654, 0x834D9A82, 0x1F610FF8, 0x6B7A832E, 0x6FD35A10, 0x1BC8D6C6, 0x87E443BC, 0xF3FFCF6A, 0xCBA6E59E, 0xBFBD6948, 0x2391FC32, 0x578A70E4, - 0x2AC24E4E, 0x5ED9C298, 0xC2F557E2, 0xB6EEDB34, 0x8EB7F1C0, 0xFAAC7D16, 0x6680E86C, 0x129B64BA, 0x1632BD84, 0x62293152, 0xFE05A428, 0x8A1E28FE, - 0xB247020A, 0xC65C8EDC, 0x5A701BA6, 0x2E6B9770, 0xA0E066F2, 0xD4FBEA24, 0x48D77F5E, 0x3CCCF388, 0x0495D97C, 0x708E55AA, 0xECA2C0D0, 0x98B94C06, - 0x9C109538, 0xE80B19EE, 0x74278C94, 0x003C0042, 0x38652AB6, 0x4C7EA660, 0xD052331A, 0xA449BFCC, 0xD9018166, 0xAD1A0DB0, 0x313698CA, 0x452D141C, - 0x7D743EE8, 0x096FB23E, 0x95432744, 0xE158AB92, 0xE5F172AC, 0x91EAFE7A, 0x0DC66B00, 0x79DDE7D6, 0x4184CD22, 0x359F41F4, 0xA9B3D48E, 0xDDA85858, - 0xC0BFBB5C, 0xB4A4378A, 0x2888A2F0, 0x5C932E26, 0x64CA04D2, 0x10D18804, 0x8CFD1D7E, 0xF8E691A8, 0xFC4F4896, 0x8854C440, 0x1478513A, 0x6063DDEC, - 0x583AF718, 0x2C217BCE, 0xB00DEEB4, 0xC4166262, 0xB95E5CC8, 0xCD45D01E, 0x51694564, 0x2572C9B2, 0x1D2BE346, 0x69306F90, 0xF51CFAEA, 0x8107763C, - 0x85AEAF02, 0xF1B523D4, 0x6D99B6AE, 0x19823A78, 0x21DB108C, 0x55C09C5A, 0xC9EC0920, 0xBDF785F6, 0x337C7474, 0x4767F8A2, 0xDB4B6DD8, 0xAF50E10E, - 0x9709CBFA, 0xE312472C, 0x7F3ED256, 0x0B255E80, 0x0F8C87BE, 0x7B970B68, 0xE7BB9E12, 0x93A012C4, 0xABF93830, 0xDFE2B4E6, 0x43CE219C, 0x37D5AD4A, - 0x4A9D93E0, 0x3E861F36, 0xA2AA8A4C, 0xD6B1069A, 0xEEE82C6E, 0x9AF3A0B8, 0x06DF35C2, 0x72C4B914, 0x766D602A, 0x0276ECFC, 0x9E5A7986, 0xEA41F550, - 0xD218DFA4, 0xA6035372, 0x3A2FC608, 0x4E344ADE -}; - -unsigned int getCRC32(daal::byte * input, unsigned int prevRes, size_t len) -{ + 0x00000000, 0x741B8CD6, 0xE83719AC, 0x9C2C957A, 0xA475BF8E, 0xD06E3358, + 0x4C42A622, 0x38592AF4, 0x3CF0F3CA, 0x48EB7F1C, 0xD4C7EA66, 0xA0DC66B0, + 0x98854C44, 0xEC9EC092, 0x70B255E8, 0x04A9D93E, 0x79E1E794, 0x0DFA6B42, + 0x91D6FE38, 0xE5CD72EE, 0xDD94581A, 0xA98FD4CC, 0x35A341B6, 0x41B8CD60, + 0x4511145E, 0x310A9888, 0xAD260DF2, 0xD93D8124, 0xE164ABD0, 0x957F2706, + 0x0953B27C, 0x7D483EAA, 0xF3C3CF28, 0x87D843FE, 0x1BF4D684, 0x6FEF5A52, + 0x57B670A6, 0x23ADFC70, 0xBF81690A, 0xCB9AE5DC, 0xCF333CE2, 0xBB28B034, + 0x2704254E, 0x531FA998, 0x6B46836C, 0x1F5D0FBA, 0x83719AC0, 0xF76A1616, + 0x8A2228BC, 0xFE39A46A, 0x62153110, 0x160EBDC6, 0x2E579732, 0x5A4C1BE4, + 0xC6608E9E, 0xB27B0248, 0xB6D2DB76, 0xC2C957A0, 0x5EE5C2DA, 0x2AFE4E0C, + 0x12A764F8, 0x66BCE82E, 0xFA907D54, 0x8E8BF182, 0x939C1286, 0xE7879E50, + 0x7BAB0B2A, 0x0FB087FC, 0x37E9AD08, 0x43F221DE, 0xDFDEB4A4, 0xABC53872, + 0xAF6CE14C, 0xDB776D9A, 0x475BF8E0, 0x33407436, 0x0B195EC2, 0x7F02D214, + 0xE32E476E, 0x9735CBB8, 0xEA7DF512, 0x9E6679C4, 0x024AECBE, 0x76516068, + 0x4E084A9C, 0x3A13C64A, 0xA63F5330, 0xD224DFE6, 0xD68D06D8, 0xA2968A0E, + 0x3EBA1F74, 0x4AA193A2, 0x72F8B956, 0x06E33580, 0x9ACFA0FA, 0xEED42C2C, + 0x605FDDAE, 0x14445178, 0x8868C402, 0xFC7348D4, 0xC42A6220, 0xB031EEF6, + 0x2C1D7B8C, 0x5806F75A, 0x5CAF2E64, 0x28B4A2B2, 0xB49837C8, 0xC083BB1E, + 0xF8DA91EA, 0x8CC11D3C, 0x10ED8846, 0x64F60490, 0x19BE3A3A, 0x6DA5B6EC, + 0xF1892396, 0x8592AF40, 0xBDCB85B4, 0xC9D00962, 0x55FC9C18, 0x21E710CE, + 0x254EC9F0, 0x51554526, 0xCD79D05C, 0xB9625C8A, 0x813B767E, 0xF520FAA8, + 0x690C6FD2, 0x1D17E304, 0x5323A9DA, 0x2738250C, 
0xBB14B076, 0xCF0F3CA0, + 0xF7561654, 0x834D9A82, 0x1F610FF8, 0x6B7A832E, 0x6FD35A10, 0x1BC8D6C6, + 0x87E443BC, 0xF3FFCF6A, 0xCBA6E59E, 0xBFBD6948, 0x2391FC32, 0x578A70E4, + 0x2AC24E4E, 0x5ED9C298, 0xC2F557E2, 0xB6EEDB34, 0x8EB7F1C0, 0xFAAC7D16, + 0x6680E86C, 0x129B64BA, 0x1632BD84, 0x62293152, 0xFE05A428, 0x8A1E28FE, + 0xB247020A, 0xC65C8EDC, 0x5A701BA6, 0x2E6B9770, 0xA0E066F2, 0xD4FBEA24, + 0x48D77F5E, 0x3CCCF388, 0x0495D97C, 0x708E55AA, 0xECA2C0D0, 0x98B94C06, + 0x9C109538, 0xE80B19EE, 0x74278C94, 0x003C0042, 0x38652AB6, 0x4C7EA660, + 0xD052331A, 0xA449BFCC, 0xD9018166, 0xAD1A0DB0, 0x313698CA, 0x452D141C, + 0x7D743EE8, 0x096FB23E, 0x95432744, 0xE158AB92, 0xE5F172AC, 0x91EAFE7A, + 0x0DC66B00, 0x79DDE7D6, 0x4184CD22, 0x359F41F4, 0xA9B3D48E, 0xDDA85858, + 0xC0BFBB5C, 0xB4A4378A, 0x2888A2F0, 0x5C932E26, 0x64CA04D2, 0x10D18804, + 0x8CFD1D7E, 0xF8E691A8, 0xFC4F4896, 0x8854C440, 0x1478513A, 0x6063DDEC, + 0x583AF718, 0x2C217BCE, 0xB00DEEB4, 0xC4166262, 0xB95E5CC8, 0xCD45D01E, + 0x51694564, 0x2572C9B2, 0x1D2BE346, 0x69306F90, 0xF51CFAEA, 0x8107763C, + 0x85AEAF02, 0xF1B523D4, 0x6D99B6AE, 0x19823A78, 0x21DB108C, 0x55C09C5A, + 0xC9EC0920, 0xBDF785F6, 0x337C7474, 0x4767F8A2, 0xDB4B6DD8, 0xAF50E10E, + 0x9709CBFA, 0xE312472C, 0x7F3ED256, 0x0B255E80, 0x0F8C87BE, 0x7B970B68, + 0xE7BB9E12, 0x93A012C4, 0xABF93830, 0xDFE2B4E6, 0x43CE219C, 0x37D5AD4A, + 0x4A9D93E0, 0x3E861F36, 0xA2AA8A4C, 0xD6B1069A, 0xEEE82C6E, 0x9AF3A0B8, + 0x06DF35C2, 0x72C4B914, 0x766D602A, 0x0276ECFC, 0x9E5A7986, 0xEA41F550, + 0xD218DFA4, 0xA6035372, 0x3A2FC608, 0x4E344ADE}; + +unsigned int getCRC32(daal::byte *input, unsigned int prevRes, size_t len) { size_t i; - daal::byte * p; + daal::byte *p; unsigned int res, highDigit, nextDigit; const unsigned int crcPoly = 0xBA0DC66B; @@ -691,30 +708,29 @@ unsigned int getCRC32(daal::byte * input, unsigned int prevRes, size_t len) res = prevRes; - for (i = 0; i < len; i++) - { + for (i = 0; i < len; i++) { highDigit = res >> 24; nextDigit = (unsigned int)(p[len - 1 - i]); - res = (res << 8) ^ nextDigit; - res = res ^ crcRem[highDigit]; + res = (res << 8) ^ nextDigit; + res = res ^ crcRem[highDigit]; } - if (res >= crcPoly) - { + if (res >= crcPoly) { res = res ^ crcPoly; } return res; } -void printALSRatings(NumericTablePtr usersOffsetTable, NumericTablePtr itemsOffsetTable, NumericTablePtr ratings) -{ +void printALSRatings(NumericTablePtr usersOffsetTable, + NumericTablePtr itemsOffsetTable, + NumericTablePtr ratings) { size_t nUsers = ratings->getNumberOfRows(); size_t nItems = ratings->getNumberOfColumns(); BlockDescriptor block1; ratings->getBlockOfRows(0, nUsers, readOnly, block1); - DAAL_DATA_TYPE * ratingsData = block1.getBlockPtr(); + DAAL_DATA_TYPE *ratingsData = block1.getBlockPtr(); size_t usersOffset, itemsOffset; BlockDescriptor block; @@ -727,18 +743,16 @@ void printALSRatings(NumericTablePtr usersOffsetTable, NumericTablePtr itemsOffs itemsOffsetTable->releaseBlockOfRows(block); std::cout << " User ID, Item ID, rating" << std::endl; - for (size_t i = 0; i < nUsers; i++) - { - for (size_t j = 0; j < nItems; j++) - { - std::cout << i + usersOffset << ", " << j + itemsOffset << ", " << ratingsData[i * nItems + j] << std::endl; + for (size_t i = 0; i < nUsers; i++) { + for (size_t j = 0; j < nItems; j++) { + std::cout << i + usersOffset << ", " << j + itemsOffset << ", " + << ratingsData[i * nItems + j] << std::endl; } } ratings->releaseBlockOfRows(block1); } -size_t serializeDAALObject(SerializationIface * pData, ByteBuffer & buffer) -{ +size_t 
serializeDAALObject(SerializationIface *pData, ByteBuffer &buffer) { /* Create a data archive to serialize the numeric table */ InputDataArchive dataArch; @@ -750,12 +764,12 @@ size_t serializeDAALObject(SerializationIface * pData, ByteBuffer & buffer) /* Store the serialized data in an array */ buffer.resize(length); - if (length) dataArch.copyArchiveToArray(&buffer[0], length); + if (length) + dataArch.copyArchiveToArray(&buffer[0], length); return length; } -SerializationIfacePtr deserializeDAALObject(daal::byte * buff, size_t length) -{ +SerializationIfacePtr deserializeDAALObject(daal::byte *buff, size_t length) { /* Create a data archive to deserialize the object */ OutputDataArchive dataArch(buff, length); diff --git a/mllib-dal/src/main/native/service.h b/mllib-dal/src/main/native/service.h index b6a2cc5c5..8696993b5 100644 --- a/mllib-dal/src/main/native/service.h +++ b/mllib-dal/src/main/native/service.h @@ -1,19 +1,19 @@ /* file: service.h */ /******************************************************************************* -* Copyright 2017-2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ + * Copyright 2017-2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ /* ! 
Content: @@ -28,23 +28,24 @@ using namespace daal::data_management; #include -#include -#include +#include #include -#include #include -#include -#include +#include #include +#include +#include +#include #include "error_handling.h" typedef std::vector ByteBuffer; -void printNumericTable(const NumericTablePtr & dataTable, const char * message = "", size_t nPrintedRows = 0, size_t nPrintedCols = 0, - size_t interval = 10); -size_t serializeDAALObject(SerializationIface * pData, ByteBuffer & buffer); -SerializationIfacePtr deserializeDAALObject(daal::byte * buff, size_t length); -CSRNumericTable * createFloatSparseTable(const std::string & datasetFileName); +void printNumericTable(const NumericTablePtr &dataTable, + const char *message = "", size_t nPrintedRows = 0, + size_t nPrintedCols = 0, size_t interval = 10); +size_t serializeDAALObject(SerializationIface *pData, ByteBuffer &buffer); +SerializationIfacePtr deserializeDAALObject(daal::byte *buff, size_t length); +CSRNumericTable *createFloatSparseTable(const std::string &datasetFileName); #endif diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/clustering/KMeans.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/clustering/KMeans.scala index 77eb1e928..dc3dbbd6e 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/clustering/KMeans.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/clustering/KMeans.scala @@ -334,12 +334,8 @@ class KMeans @Since("1.5.0") ( override def fit(dataset: Dataset[_]): KMeansModel = instrumented { instr => transformSchema(dataset.schema, logging = true) - val isPlatformSupported = Utils.checkClusterPlatformCompatibility(dataset.sparkSession.sparkContext) + val handlePersistence = (dataset.storageLevel == StorageLevel.NONE) val handleWeight = isDefined(weightCol) && $(weightCol).nonEmpty - val useKMeansDAL = isPlatformSupported && $(distanceMeasure) == "euclidean" && !handleWeight - - // will handle persistence only for trainWithML - val handlePersistence = (dataset.storageLevel == StorageLevel.NONE && !useKMeansDAL) val w = if (handleWeight) { col($(weightCol)).cast(DoubleType) } else { @@ -351,25 +347,19 @@ class KMeans @Since("1.5.0") ( case Row(point: Vector, weight: Double) => (point, weight) } - if (handlePersistence) { - instances.persist(StorageLevel.MEMORY_AND_DISK) - } - instr.logPipelineStage(this) instr.logDataset(dataset) instr.logParams(this, featuresCol, predictionCol, k, initMode, initSteps, distanceMeasure, maxIter, seed, tol, weightCol) + val isPlatformSupported = Utils.checkClusterPlatformCompatibility( + dataset.sparkSession.sparkContext) + val useKMeansDAL = isPlatformSupported && $(distanceMeasure) == "euclidean" && !handleWeight + val model = if (useKMeansDAL) { - val offheapEnabled=instances.sparkContext.getConf.getBoolean("spark.memory.offHeap.enabled", false) - if (offheapEnabled) { - instances.setName("instancesRDD").persist(StorageLevel.OFF_HEAP) - } else { - instances.setName("instancesRDD").persist(StorageLevel.MEMORY_AND_DISK) - } - trainWithDAL(instances) + trainWithDAL(instances, handlePersistence) } else { - trainWithML(instances) + trainWithML(instances, handlePersistence) } val summary = new KMeansSummary( @@ -382,13 +372,12 @@ class KMeans @Since("1.5.0") ( model.setSummary(Some(summary)) instr.logNamedValue("clusterSizes", summary.clusterSizes) - if (handlePersistence) { - instances.unpersist() - } + model } - private def trainWithDAL(instances: RDD[(Vector, Double)]): KMeansModel = instrumented { instr => + private def 
trainWithDAL(instances: RDD[(Vector, Double)], + handlePersistence: Boolean): KMeansModel = instrumented { instr => val sc = instances.sparkContext @@ -414,12 +403,18 @@ class KMeans @Since("1.5.0") ( val dataWithNorm = instances.map { case (point: Vector, weight: Double) => new VectorWithNorm(point) } + + // Cache for init + dataWithNorm.persist(StorageLevel.MEMORY_AND_DISK) + val centersWithNorm = if ($(initMode) == "random") { mllibKMeans.initRandom(dataWithNorm) } else { mllibKMeans.initKMeansParallel(dataWithNorm, distanceMeasureInstance) } + dataWithNorm.unpersist() + val centers = centersWithNorm.map(_.vector) val initTimeInSeconds = (System.nanoTime() - initStartTime) / 1e9 @@ -427,6 +422,10 @@ class KMeans @Since("1.5.0") ( val strInitMode = $(initMode) logInfo(f"Initialization with $strInitMode took $initTimeInSeconds%.3f seconds.") + if (handlePersistence) { + instances.persist(StorageLevel.MEMORY_AND_DISK) + } + val inputData = instances.map { case (point: Vector, weight: Double) => point } @@ -434,31 +433,44 @@ class KMeans @Since("1.5.0") ( val kmeansDAL = new KMeansDALImpl(getK, getMaxIter, getTol, DistanceMeasure.EUCLIDEAN, centers, executor_num, executor_cores) - val parentModel = kmeansDAL.runWithRDDVector(inputData, Option(instr)) + val parentModel = kmeansDAL.train(inputData, Option(instr)) val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) - model + if (handlePersistence) { + instances.unpersist() + } + model } - private def trainWithML(instances: RDD[(Vector, Double)]): KMeansModel = instrumented { instr => - val oldVectorInstances = instances.map { - case (point: Vector, weight: Double) => (OldVectors.fromML(point), weight) - } - val algo = new MLlibKMeans() - .setK($(k)) - .setInitializationMode($(initMode)) - .setInitializationSteps($(initSteps)) - .setMaxIterations($(maxIter)) - .setSeed($(seed)) - .setEpsilon($(tol)) - .setDistanceMeasure($(distanceMeasure)) - val parentModel = algo.runWithWeight(oldVectorInstances, Option(instr)) - val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) - model + private def trainWithML(instances: RDD[(Vector, Double)], + handlePersistence: Boolean): KMeansModel = instrumented { instr => + if (handlePersistence) { + instances.persist(StorageLevel.MEMORY_AND_DISK) + } + + val oldVectorInstances = instances.map { + case (point: Vector, weight: Double) => (OldVectors.fromML(point), weight) + } + val algo = new MLlibKMeans() + .setK($(k)) + .setInitializationMode($(initMode)) + .setInitializationSteps($(initSteps)) + .setMaxIterations($(maxIter)) + .setSeed($(seed)) + .setEpsilon($(tol)) + .setDistanceMeasure($(distanceMeasure)) + val parentModel = algo.runWithWeight(oldVectorInstances, Option(instr)) + val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) + + if (handlePersistence) { + instances.unpersist() } + model + } + @Since("1.5.0") override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/feature/PCA.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/feature/PCA.scala index 0c9c8ad9e..14e9a2ce1 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/feature/PCA.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/feature/PCA.scala @@ -96,14 +96,15 @@ class PCA @Since("1.5.0") ( s"source vector size $numFeatures must be no less than k=$k") val sc = dataset.sparkSession.sparkContext - val isPlatformSupported = 
Utils.checkClusterPlatformCompatibility(dataset.sparkSession.sparkContext) + val isPlatformSupported = Utils.checkClusterPlatformCompatibility( + dataset.sparkSession.sparkContext) // Call oneDAL Correlation PCA implementation when numFeatures < 65535 and fall back otherwise val parentModel = if (numFeatures < 65535 && isPlatformSupported) { val executor_num = Utils.sparkExecutorNum(dataset.sparkSession.sparkContext) val executor_cores = Utils.sparkExecutorCores() val pca = new PCADALImpl(k = $(k), executor_num, executor_cores) - val pcaModel = pca.fitWithDAL(inputVectors) + val pcaModel = pca.train(inputVectors) pcaModel } else { val inputOldVectors = inputVectors.map { diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/recommendation/ALS.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/recommendation/ALS.scala index 9196873fb..e59c642c9 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/recommendation/ALS.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.0/ml/recommendation/ALS.scala @@ -923,7 +923,7 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { val (userIdAndFactors, itemIdAndFactors) = if (implicitPrefs && isPlatformSupported) { - new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).run() + new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).train() } else { trainMLlib(ratings, rank, numUserBlocks, numItemBlocks, maxIter, regParam, implicitPrefs, alpha, nonnegative, intermediateRDDStorageLevel, finalRDDStorageLevel, diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/clustering/KMeans.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/clustering/KMeans.scala index 1aa016af7..7af0ffacf 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/clustering/KMeans.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/clustering/KMeans.scala @@ -329,51 +329,30 @@ class KMeans @Since("1.5.0") ( override def fit(dataset: Dataset[_]): KMeansModel = instrumented { instr => transformSchema(dataset.schema, logging = true) - val isPlatformSupported = Utils.checkClusterPlatformCompatibility(dataset.sparkSession.sparkContext) - val handleWeight = isDefined(weightCol) && $(weightCol).nonEmpty - val useKMeansDAL = isPlatformSupported && $(distanceMeasure) == "euclidean" && !handleWeight - logInfo(s"useKMeansDAL = $useKMeansDAL") - - // will handle persistence only for trainWithML - // val handlePersistence = (dataset.storageLevel == StorageLevel.NONE && !useKMeansDAL) - // val w = if (handleWeight) { - // col($(weightCol)).cast(DoubleType) - // } else { - // lit(1.0) - // } - - // val instances: RDD[(Vector, Double)] = dataset - // .select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w).rdd.map { - // case Row(point: Vector, weight: Double) => (point, weight) - // } - - // if (handlePersistence) { - // instances.persist(StorageLevel.MEMORY_AND_DISK) - // } - instr.logPipelineStage(this) instr.logDataset(dataset) instr.logParams(this, featuresCol, predictionCol, k, initMode, initSteps, distanceMeasure, maxIter, seed, tol, weightCol) + val handlePersistence = (dataset.storageLevel == StorageLevel.NONE) + val handleWeight = isDefined(weightCol) && $(weightCol).nonEmpty val w = if (handleWeight) { col($(weightCol)).cast(DoubleType) } else { lit(1.0) } - val instances = dataset.select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w) - .rdd.map { case Row(point: Vector, weight: Double) => (point, weight) } - val handlePersistence = (dataset.storageLevel == StorageLevel.NONE && 
!useKMeansDAL) + val instances: RDD[(Vector, Double)] = dataset + .select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w).rdd.map { + case Row(point: Vector, weight: Double) => (point, weight) + } + + val isPlatformSupported = Utils.checkClusterPlatformCompatibility( + dataset.sparkSession.sparkContext) + val useKMeansDAL = isPlatformSupported && $(distanceMeasure) == "euclidean" && !handleWeight val model = if (useKMeansDAL) { - val offheapEnabled=instances.sparkContext.getConf.getBoolean("spark.memory.offHeap.enabled", false) - if (offheapEnabled) { - instances.setName("instancesRDD").persist(StorageLevel.OFF_HEAP) - } else { - instances.setName("instancesRDD").persist(StorageLevel.MEMORY_AND_DISK) - } - trainWithDAL(instances) + trainWithDAL(instances, handlePersistence) } else { trainWithML(instances, handlePersistence) } @@ -388,13 +367,12 @@ class KMeans @Since("1.5.0") ( model.setSummary(Some(summary)) instr.logNamedValue("clusterSizes", summary.clusterSizes) - // if (handlePersistence) { - // instances.unpersist() - // } + model } - private def trainWithDAL(instances: RDD[(Vector, Double)]): KMeansModel = instrumented { instr => + private def trainWithDAL(instances: RDD[(Vector, Double)], + handlePersistence: Boolean): KMeansModel = instrumented { instr => val sc = instances.sparkContext @@ -420,12 +398,18 @@ class KMeans @Since("1.5.0") ( val dataWithNorm = instances.map { case (point: Vector, weight: Double) => new VectorWithNorm(point) } + + // Cache for init + dataWithNorm.persist(StorageLevel.MEMORY_AND_DISK) + val centersWithNorm = if ($(initMode) == "random") { mllibKMeans.initRandom(dataWithNorm) } else { mllibKMeans.initKMeansParallel(dataWithNorm, distanceMeasureInstance) } + dataWithNorm.unpersist() + val centers = centersWithNorm.map(_.vector) val initTimeInSeconds = (System.nanoTime() - initStartTime) / 1e9 @@ -433,6 +417,10 @@ class KMeans @Since("1.5.0") ( val strInitMode = $(initMode) logInfo(f"Initialization with $strInitMode took $initTimeInSeconds%.3f seconds.") + if (handlePersistence) { + instances.persist(StorageLevel.MEMORY_AND_DISK) + } + val inputData = instances.map { case (point: Vector, weight: Double) => point } @@ -440,32 +428,35 @@ class KMeans @Since("1.5.0") ( val kmeansDAL = new KMeansDALImpl(getK, getMaxIter, getTol, DistanceMeasure.EUCLIDEAN, centers, executor_num, executor_cores) - val parentModel = kmeansDAL.runWithRDDVector(inputData, Option(instr)) + val parentModel = kmeansDAL.train(inputData, Option(instr)) val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) - model + if (handlePersistence) { + instances.unpersist() + } + model } - private def trainWithML( - instances: RDD[(Vector, Double)], - handlePersistence: Boolean): KMeansModel = instrumented { instr => - val oldVectorInstances = instances.map { - case (point: Vector, weight: Double) => (OldVectors.fromML(point), weight) - } - val algo = new MLlibKMeans() - .setK($(k)) - .setInitializationMode($(initMode)) - .setInitializationSteps($(initSteps)) - .setMaxIterations($(maxIter)) - .setSeed($(seed)) - .setEpsilon($(tol)) - .setDistanceMeasure($(distanceMeasure)) - val parentModel = algo.runWithWeight(oldVectorInstances, handlePersistence, Some(instr)) - val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) - model + private def trainWithML(instances: RDD[(Vector, Double)], + handlePersistence: Boolean): KMeansModel = instrumented { instr => + val oldVectorInstances = instances.map { + case (point: Vector, weight: Double) => 
(OldVectors.fromML(point), weight) } + val algo = new MLlibKMeans() + .setK($(k)) + .setInitializationMode($(initMode)) + .setInitializationSteps($(initSteps)) + .setMaxIterations($(maxIter)) + .setSeed($(seed)) + .setEpsilon($(tol)) + .setDistanceMeasure($(distanceMeasure)) + val parentModel = algo.runWithWeight(oldVectorInstances, handlePersistence, Some(instr)) + val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) + + model + } @Since("1.5.0") override def transformSchema(schema: StructType): StructType = { diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/feature/PCA.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/feature/PCA.scala index 0c9c8ad9e..14e9a2ce1 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/feature/PCA.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/feature/PCA.scala @@ -96,14 +96,15 @@ class PCA @Since("1.5.0") ( s"source vector size $numFeatures must be no less than k=$k") val sc = dataset.sparkSession.sparkContext - val isPlatformSupported = Utils.checkClusterPlatformCompatibility(dataset.sparkSession.sparkContext) + val isPlatformSupported = Utils.checkClusterPlatformCompatibility( + dataset.sparkSession.sparkContext) // Call oneDAL Correlation PCA implementation when numFeatures < 65535 and fall back otherwise val parentModel = if (numFeatures < 65535 && isPlatformSupported) { val executor_num = Utils.sparkExecutorNum(dataset.sparkSession.sparkContext) val executor_cores = Utils.sparkExecutorCores() val pca = new PCADALImpl(k = $(k), executor_num, executor_cores) - val pcaModel = pca.fitWithDAL(inputVectors) + val pcaModel = pca.train(inputVectors) pcaModel } else { val inputOldVectors = inputVectors.map { diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/recommendation/ALS.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/recommendation/ALS.scala index 9196873fb..e59c642c9 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/recommendation/ALS.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.1/ml/recommendation/ALS.scala @@ -923,7 +923,7 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { val (userIdAndFactors, itemIdAndFactors) = if (implicitPrefs && isPlatformSupported) { - new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).run() + new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).train() } else { trainMLlib(ratings, rank, numUserBlocks, numItemBlocks, maxIter, regParam, implicitPrefs, alpha, nonnegative, intermediateRDDStorageLevel, finalRDDStorageLevel, diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/clustering/KMeans.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/clustering/KMeans.scala index 1aa016af7..54b406f3e 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/clustering/KMeans.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/clustering/KMeans.scala @@ -329,51 +329,30 @@ class KMeans @Since("1.5.0") ( override def fit(dataset: Dataset[_]): KMeansModel = instrumented { instr => transformSchema(dataset.schema, logging = true) - val isPlatformSupported = Utils.checkClusterPlatformCompatibility(dataset.sparkSession.sparkContext) - val handleWeight = isDefined(weightCol) && $(weightCol).nonEmpty - val useKMeansDAL = isPlatformSupported && $(distanceMeasure) == "euclidean" && !handleWeight - logInfo(s"useKMeansDAL = $useKMeansDAL") - - // will handle persistence only for trainWithML - // val handlePersistence = (dataset.storageLevel == StorageLevel.NONE && 
!useKMeansDAL) - // val w = if (handleWeight) { - // col($(weightCol)).cast(DoubleType) - // } else { - // lit(1.0) - // } - - // val instances: RDD[(Vector, Double)] = dataset - // .select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w).rdd.map { - // case Row(point: Vector, weight: Double) => (point, weight) - // } - - // if (handlePersistence) { - // instances.persist(StorageLevel.MEMORY_AND_DISK) - // } - instr.logPipelineStage(this) instr.logDataset(dataset) instr.logParams(this, featuresCol, predictionCol, k, initMode, initSteps, distanceMeasure, maxIter, seed, tol, weightCol) + val handlePersistence = (dataset.storageLevel == StorageLevel.NONE) + val handleWeight = isDefined(weightCol) && $(weightCol).nonEmpty val w = if (handleWeight) { col($(weightCol)).cast(DoubleType) } else { lit(1.0) } - val instances = dataset.select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w) - .rdd.map { case Row(point: Vector, weight: Double) => (point, weight) } - val handlePersistence = (dataset.storageLevel == StorageLevel.NONE && !useKMeansDAL) + val instances: RDD[(Vector, Double)] = dataset + .select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w).rdd.map { + case Row(point: Vector, weight: Double) => (point, weight) + } + + val isPlatformSupported = Utils.checkClusterPlatformCompatibility( + dataset.sparkSession.sparkContext) + val useKMeansDAL = isPlatformSupported && $(distanceMeasure) == "euclidean" && !handleWeight val model = if (useKMeansDAL) { - val offheapEnabled=instances.sparkContext.getConf.getBoolean("spark.memory.offHeap.enabled", false) - if (offheapEnabled) { - instances.setName("instancesRDD").persist(StorageLevel.OFF_HEAP) - } else { - instances.setName("instancesRDD").persist(StorageLevel.MEMORY_AND_DISK) - } - trainWithDAL(instances) + trainWithDAL(instances, handlePersistence) } else { trainWithML(instances, handlePersistence) } @@ -388,13 +367,12 @@ class KMeans @Since("1.5.0") ( model.setSummary(Some(summary)) instr.logNamedValue("clusterSizes", summary.clusterSizes) - // if (handlePersistence) { - // instances.unpersist() - // } + model } - private def trainWithDAL(instances: RDD[(Vector, Double)]): KMeansModel = instrumented { instr => + private def trainWithDAL(instances: RDD[(Vector, Double)], + handlePersistence: Boolean): KMeansModel = instrumented { instr => val sc = instances.sparkContext @@ -420,12 +398,18 @@ class KMeans @Since("1.5.0") ( val dataWithNorm = instances.map { case (point: Vector, weight: Double) => new VectorWithNorm(point) } + + // Cache for init + dataWithNorm.persist(StorageLevel.MEMORY_AND_DISK) + val centersWithNorm = if ($(initMode) == "random") { mllibKMeans.initRandom(dataWithNorm) } else { mllibKMeans.initKMeansParallel(dataWithNorm, distanceMeasureInstance) } + dataWithNorm.unpersist() + val centers = centersWithNorm.map(_.vector) val initTimeInSeconds = (System.nanoTime() - initStartTime) / 1e9 @@ -433,6 +417,10 @@ class KMeans @Since("1.5.0") ( val strInitMode = $(initMode) logInfo(f"Initialization with $strInitMode took $initTimeInSeconds%.3f seconds.") + if (handlePersistence) { + instances.persist(StorageLevel.MEMORY_AND_DISK) + } + val inputData = instances.map { case (point: Vector, weight: Double) => point } @@ -440,17 +428,19 @@ class KMeans @Since("1.5.0") ( val kmeansDAL = new KMeansDALImpl(getK, getMaxIter, getTol, DistanceMeasure.EUCLIDEAN, centers, executor_num, executor_cores) - val parentModel = kmeansDAL.runWithRDDVector(inputData, Option(instr)) + val parentModel = kmeansDAL.train(inputData, 
Option(instr)) val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) - model + if (handlePersistence) { + instances.unpersist() + } + model } - private def trainWithML( - instances: RDD[(Vector, Double)], - handlePersistence: Boolean): KMeansModel = instrumented { instr => + private def trainWithML(instances: RDD[(Vector, Double)], + handlePersistence: Boolean): KMeansModel = instrumented { instr => val oldVectorInstances = instances.map { case (point: Vector, weight: Double) => (OldVectors.fromML(point), weight) } diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/feature/PCA.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/feature/PCA.scala index 0c9c8ad9e..14e9a2ce1 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/feature/PCA.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/feature/PCA.scala @@ -96,14 +96,15 @@ class PCA @Since("1.5.0") ( s"source vector size $numFeatures must be no less than k=$k") val sc = dataset.sparkSession.sparkContext - val isPlatformSupported = Utils.checkClusterPlatformCompatibility(dataset.sparkSession.sparkContext) + val isPlatformSupported = Utils.checkClusterPlatformCompatibility( + dataset.sparkSession.sparkContext) // Call oneDAL Correlation PCA implementation when numFeatures < 65535 and fall back otherwise val parentModel = if (numFeatures < 65535 && isPlatformSupported) { val executor_num = Utils.sparkExecutorNum(dataset.sparkSession.sparkContext) val executor_cores = Utils.sparkExecutorCores() val pca = new PCADALImpl(k = $(k), executor_num, executor_cores) - val pcaModel = pca.fitWithDAL(inputVectors) + val pcaModel = pca.train(inputVectors) pcaModel } else { val inputOldVectors = inputVectors.map { diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/recommendation/ALS.scala b/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/recommendation/ALS.scala index 9196873fb..e59c642c9 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/recommendation/ALS.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.0.2/ml/recommendation/ALS.scala @@ -923,7 +923,7 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { val (userIdAndFactors, itemIdAndFactors) = if (implicitPrefs && isPlatformSupported) { - new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).run() + new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).train() } else { trainMLlib(ratings, rank, numUserBlocks, numItemBlocks, maxIter, regParam, implicitPrefs, alpha, nonnegative, intermediateRDDStorageLevel, finalRDDStorageLevel, diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/clustering/KMeans.scala b/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/clustering/KMeans.scala index a3c8b8568..0878c146d 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/clustering/KMeans.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/clustering/KMeans.scala @@ -330,33 +330,12 @@ class KMeans @Since("1.5.0") ( override def fit(dataset: Dataset[_]): KMeansModel = instrumented { instr => transformSchema(dataset.schema, logging = true) - val isPlatformSupported = Utils.checkClusterPlatformCompatibility(dataset.sparkSession.sparkContext) - val handleWeight = isDefined(weightCol) && $(weightCol).nonEmpty - val useKMeansDAL = isPlatformSupported && $(distanceMeasure) == "euclidean" && !handleWeight - logInfo(s"useKMeansDAL = $useKMeansDAL") - - // will handle persistence only for trainWithML - // val handlePersistence = (dataset.storageLevel == StorageLevel.NONE 
&& !useKMeansDAL) - // val w = if (handleWeight) { - // col($(weightCol)).cast(DoubleType) - // } else { - // lit(1.0) - // } - - // val instances: RDD[(Vector, Double)] = dataset - // .select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w).rdd.map { - // case Row(point: Vector, weight: Double) => (point, weight) - // } - - // if (handlePersistence) { - // instances.persist(StorageLevel.MEMORY_AND_DISK) - // } - instr.logPipelineStage(this) instr.logDataset(dataset) instr.logParams(this, featuresCol, predictionCol, k, initMode, initSteps, distanceMeasure, maxIter, seed, tol, weightCol) + val handleWeight = isDefined(weightCol) && $(weightCol).nonEmpty val w = if (handleWeight) { checkNonNegativeWeight(col($(weightCol)).cast(DoubleType)) } else { @@ -365,16 +344,14 @@ class KMeans @Since("1.5.0") ( val instances = dataset.select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w) .rdd.map { case Row(point: Vector, weight: Double) => (point, weight) } - val handlePersistence = (dataset.storageLevel == StorageLevel.NONE && !useKMeansDAL) + val handlePersistence = (dataset.storageLevel == StorageLevel.NONE) + + val isPlatformSupported = Utils.checkClusterPlatformCompatibility( + dataset.sparkSession.sparkContext) + val useKMeansDAL = isPlatformSupported && $(distanceMeasure) == "euclidean" && !handleWeight val model = if (useKMeansDAL) { - val offheapEnabled=instances.sparkContext.getConf.getBoolean("spark.memory.offHeap.enabled", false) - if (offheapEnabled) { - instances.setName("instancesRDD").persist(StorageLevel.OFF_HEAP) - } else { - instances.setName("instancesRDD").persist(StorageLevel.MEMORY_AND_DISK) - } - trainWithDAL(instances) + trainWithDAL(instances, handlePersistence) } else { trainWithML(instances, handlePersistence) } @@ -389,13 +366,12 @@ class KMeans @Since("1.5.0") ( model.setSummary(Some(summary)) instr.logNamedValue("clusterSizes", summary.clusterSizes) - // if (handlePersistence) { - // instances.unpersist() - // } + model } - private def trainWithDAL(instances: RDD[(Vector, Double)]): KMeansModel = instrumented { instr => + private def trainWithDAL(instances: RDD[(Vector, Double)], + handlePersistence: Boolean): KMeansModel = instrumented { instr => val sc = instances.sparkContext @@ -421,12 +397,18 @@ class KMeans @Since("1.5.0") ( val dataWithNorm = instances.map { case (point: Vector, weight: Double) => new VectorWithNorm(point) } + + // Cache for init + dataWithNorm.persist(StorageLevel.MEMORY_AND_DISK) + val centersWithNorm = if ($(initMode) == "random") { mllibKMeans.initRandom(dataWithNorm) } else { mllibKMeans.initKMeansParallel(dataWithNorm, distanceMeasureInstance) } + dataWithNorm.unpersist() + val centers = centersWithNorm.map(_.vector) val initTimeInSeconds = (System.nanoTime() - initStartTime) / 1e9 @@ -434,6 +416,10 @@ class KMeans @Since("1.5.0") ( val strInitMode = $(initMode) logInfo(f"Initialization with $strInitMode took $initTimeInSeconds%.3f seconds.") + if (handlePersistence) { + instances.persist(StorageLevel.MEMORY_AND_DISK) + } + val inputData = instances.map { case (point: Vector, weight: Double) => point } @@ -441,17 +427,19 @@ class KMeans @Since("1.5.0") ( val kmeansDAL = new KMeansDALImpl(getK, getMaxIter, getTol, DistanceMeasure.EUCLIDEAN, centers, executor_num, executor_cores) - val parentModel = kmeansDAL.runWithRDDVector(inputData, Option(instr)) + val parentModel = kmeansDAL.train(inputData, Option(instr)) val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) - model + if (handlePersistence) { + 
instances.unpersist() + } + model } - private def trainWithML( - instances: RDD[(Vector, Double)], - handlePersistence: Boolean): KMeansModel = instrumented { instr => + private def trainWithML(instances: RDD[(Vector, Double)], + handlePersistence: Boolean): KMeansModel = instrumented { instr => val oldVectorInstances = instances.map { case (point: Vector, weight: Double) => (OldVectors.fromML(point), weight) } diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/feature/PCA.scala b/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/feature/PCA.scala index 0c9c8ad9e..14e9a2ce1 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/feature/PCA.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/feature/PCA.scala @@ -96,14 +96,15 @@ class PCA @Since("1.5.0") ( s"source vector size $numFeatures must be no less than k=$k") val sc = dataset.sparkSession.sparkContext - val isPlatformSupported = Utils.checkClusterPlatformCompatibility(dataset.sparkSession.sparkContext) + val isPlatformSupported = Utils.checkClusterPlatformCompatibility( + dataset.sparkSession.sparkContext) // Call oneDAL Correlation PCA implementation when numFeatures < 65535 and fall back otherwise val parentModel = if (numFeatures < 65535 && isPlatformSupported) { val executor_num = Utils.sparkExecutorNum(dataset.sparkSession.sparkContext) val executor_cores = Utils.sparkExecutorCores() val pca = new PCADALImpl(k = $(k), executor_num, executor_cores) - val pcaModel = pca.fitWithDAL(inputVectors) + val pcaModel = pca.train(inputVectors) pcaModel } else { val inputOldVectors = inputVectors.map { diff --git a/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/recommendation/ALS.scala b/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/recommendation/ALS.scala index 9196873fb..e59c642c9 100644 --- a/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/recommendation/ALS.scala +++ b/mllib-dal/src/main/scala/org/apache/spark-3.1.1/ml/recommendation/ALS.scala @@ -923,7 +923,7 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { val (userIdAndFactors, itemIdAndFactors) = if (implicitPrefs && isPlatformSupported) { - new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).run() + new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).train() } else { trainMLlib(ratings, rank, numUserBlocks, numItemBlocks, maxIter, regParam, implicitPrefs, alpha, nonnegative, intermediateRDDStorageLevel, finalRDDStorageLevel, diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala index e9e7ec36d..f2d0bbe5e 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala @@ -1,12 +1,11 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Copyright 2020 Intel Corporation * - * http://www.apache.org/licenses/LICENSE-2.0 + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,104 +16,34 @@ package org.apache.spark.ml.clustering -import com.intel.daal.data_management.data.{NumericTable, RowMergedNumericTable, Matrix => DALMatrix} -import com.intel.daal.services.DaalContext import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util._ import org.apache.spark.mllib.clustering.{KMeansModel => MLlibKMeansModel} import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} -import org.apache.spark.rdd.{ExecutorInProcessCoalescePartitioner, RDD} - -class KMeansDALImpl ( - var nClusters : Int, - var maxIterations : Int, - var tolerance : Double, - val distanceMeasure: String, - val centers: Array[OldVector], - val executorNum: Int, - val executorCores: Int -) extends Serializable with Logging { - - def runWithRDDVector(data: RDD[Vector], instr: Option[Instrumentation]) : MLlibKMeansModel = { - - instr.foreach(_.logInfo(s"Processing partitions with $executorNum executors")) - - // repartition to executorNum if not enough partitions - val dataForConversion = if (data.getNumPartitions < executorNum) { - data.repartition(executorNum).setName("Repartitioned for conversion").cache() - } else { - data - } +import org.apache.spark.rdd.RDD - val executorIPAddress = Utils.sparkFirstExecutorIP(dataForConversion.sparkContext) - val kvsIP = dataForConversion.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress) - val kvsPortDetected = Utils.checkExecutorAvailPort(dataForConversion, kvsIP) - val kvsPort = dataForConversion.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected) +class KMeansDALImpl(var nClusters: Int, + var maxIterations: Int, + var tolerance: Double, + val distanceMeasure: String, + val centers: Array[OldVector], + val executorNum: Int, + val executorCores: Int + ) extends Serializable with Logging { - val kvsIPPort = kvsIP+"_"+kvsPort + def train(data: RDD[Vector], instr: Option[Instrumentation]): MLlibKMeansModel = { - val partitionDims = Utils.getPartitionDims(dataForConversion) + val coalescedTables = OneDAL.vectorsToMergedNumericTables(data, executorNum) - // filter the empty partitions - val partRows = dataForConversion.mapPartitionsWithIndex { (index: Int, it: Iterator[Vector]) => - Iterator(Tuple3(partitionDims(index)._1, index, it)) - } - val nonEmptyPart = partRows.filter{entry => { entry._1 > 0 }} - - // convert RDD[Vector] to RDD[HomogenNumericTable] - val numericTables = nonEmptyPart.map { entry => - val numRows = entry._1 - val index = entry._2 - val it = entry._3 - val numCols = partitionDims(index)._2 - - logDebug(s"KMeansDALImpl: Partition index: $index, numCols: $numCols, numRows: $numRows") - - // Build DALMatrix, this will load libJavaAPI, libtbb, libtbbmalloc - val context = new DaalContext() - val matrix = new DALMatrix(context, classOf[java.lang.Double], - numCols.toLong, numRows.toLong, NumericTable.AllocationFlag.DoAllocate) - - logDebug("KMeansDALImpl: Loading native libraries" ) - // oneDAL libs should be loaded by now, extract libMLlibDAL.so to temp file and load - LibLoader.loadLibraries() - - import scala.collection.JavaConverters._ - - var dalRow = 0 - - it.foreach { curVector => - val rowArr = curVector.toArray - OneDAL.cSetDoubleBatch(matrix.getCNumericTable, dalRow, rowArr, 1, 
numCols) - dalRow += 1 - } + val executorIPAddress = Utils.sparkFirstExecutorIP(coalescedTables.sparkContext) + val kvsIP = coalescedTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", + executorIPAddress) + val kvsPortDetected = Utils.checkExecutorAvailPort(coalescedTables, kvsIP) + val kvsPort = coalescedTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", + kvsPortDetected) - Iterator(matrix.getCNumericTable) - - }.cache() - - // workaround to fix the bug of multi executors handling same partition. - numericTables.foreachPartition(() => _) - numericTables.count() - - val cachedRdds = data.sparkContext.getPersistentRDDs - cachedRdds.filter(r => r._2.name=="instancesRDD").foreach (r => r._2.unpersist()) - - val coalescedRdd = numericTables.coalesce(1, - partitionCoalescer = Some(new ExecutorInProcessCoalescePartitioner())) - - val coalescedTables = coalescedRdd.mapPartitions { iter => - val context = new DaalContext() - val mergedData = new RowMergedNumericTable(context) - - iter.foreach{ curIter => - val address = curIter.next() - OneDAL.cAddNumericTable(mergedData.getCNumericTable, address ) - } - Iterator(mergedData.getCNumericTable) - - }.cache() + val kvsIPPort = kvsIP + "_" + kvsPort val results = coalescedTables.mapPartitionsWithIndex { (rank, table) => val tableArr = table.next() @@ -146,16 +75,12 @@ class KMeansDALImpl ( ret }.collect() - // Release the native memory allocated by NumericTable. - numericTables.foreach( tables => - tables.foreach { address => - OneDAL.cFreeDataMemory(address) - } - ) - // Make sure there is only one result from rank 0 assert(results.length == 1) + // Release native memory for numeric tables + OneDAL.releaseNumericTables(data.sparkContext) + val centerVectors = results(0)._1 val totalCost = results(0)._2 val iterationNum = results(0)._3 diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala index e1bba3d37..f2b8645a2 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala @@ -1,12 +1,11 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Copyright 2020 Intel Corporation * - * http://www.apache.org/licenses/LICENSE-2.0 + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -18,42 +17,36 @@ package org.apache.spark.ml.feature import java.util.Arrays + import com.intel.daal.data_management.data.{HomogenNumericTable, NumericTable} + import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg._ import org.apache.spark.ml.util.{OneCCL, OneDAL, Utils} -import org.apache.spark.mllib.feature.{PCAModel => MLlibPCAModel} +import org.apache.spark.mllib.feature.{PCAModel => MLlibPCAModel, StandardScaler => MLlibStandardScaler} import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Vectors => OldVectors} import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.feature.{StandardScaler => MLlibStandardScaler} -class PCADALImpl ( - val k: Int, - val executorNum: Int, - val executorCores: Int) +class PCADALImpl(val k: Int, + val executorNum: Int, + val executorCores: Int) extends Serializable with Logging { - // Normalize data before apply fitWithDAL - private def normalizeData(input: RDD[Vector]) : RDD[Vector] = { - val vectors = input.map(OldVectors.fromML(_)) - val scaler = new MLlibStandardScaler(withMean = true, withStd = false).fit(vectors) - val res = scaler.transform(vectors) - res.map(_.asML) - } - - def fitWithDAL(data: RDD[Vector]) : MLlibPCAModel = { + def train(data: RDD[Vector]): MLlibPCAModel = { val normalizedData = normalizeData(data) - val coalescedTables = OneDAL.rddVectorToNumericTables(normalizedData, executorNum) + val coalescedTables = OneDAL.vectorsToMergedNumericTables(normalizedData, executorNum) val executorIPAddress = Utils.sparkFirstExecutorIP(coalescedTables.sparkContext) - val kvsIP = coalescedTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress) + val kvsIP = coalescedTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", + executorIPAddress) val kvsPortDetected = Utils.checkExecutorAvailPort(coalescedTables, kvsIP) - val kvsPort = coalescedTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected) + val kvsPort = coalescedTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", + kvsPortDetected) - val kvsIPPort = kvsIP+"_"+kvsPort + val kvsIPPort = kvsIP + "_" + kvsPort val results = coalescedTables.mapPartitionsWithIndex { (rank, table) => val tableArr = table.next() @@ -71,7 +64,8 @@ class PCADALImpl ( val ret = if (OneCCL.isRoot()) { val pcNumericTable = OneDAL.makeNumericTable(result.pcNumericTable) - val explainedVarianceNumericTable = OneDAL.makeNumericTable(result.explainedVarianceNumericTable) + val explainedVarianceNumericTable = OneDAL.makeNumericTable( + result.explainedVarianceNumericTable) val principleComponents = getPrincipleComponentsFromDAL(pcNumericTable, k) val explainedVariance = getExplainedVarianceFromDAL(explainedVarianceNumericTable, k) @@ -89,6 +83,9 @@ class PCADALImpl ( // Make sure there is only one result from rank 0 assert(results.length == 1) + // Release native memory for numeric tables + OneDAL.releaseNumericTables(data.sparkContext) + val pc = results(0)._1 val explainedVariance = results(0)._2 @@ -100,6 +97,14 @@ class PCADALImpl ( parentModel } + // Normalize data before training + private def normalizeData(input: RDD[Vector]): RDD[Vector] = { + val vectors = input.map(OldVectors.fromML(_)) + val scaler = new MLlibStandardScaler(withMean = true, withStd = 
false).fit(vectors) + val res = scaler.transform(vectors) + res.map(_.asML) + } + private def getPrincipleComponentsFromDAL(table: NumericTable, k: Int): DenseMatrix = { val data = table.asInstanceOf[HomogenNumericTable].getDoubleArray() @@ -124,7 +129,7 @@ class PCADALImpl ( val data = table_1xn.asInstanceOf[HomogenNumericTable].getDoubleArray() val sum = data.sum val topK = Arrays.copyOfRange(data, 0, k) - for ( i <- 0 until k ) + for (i <- 0 until k) topK(i) = topK(i) / sum new DenseVector(topK) } diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala index bcb95ca1f..1e16c97c3 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala @@ -1,185 +1,61 @@ +/* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.ml.recommendation -import com.intel.daal.data_management.data.CSRNumericTable.Indexing -import org.apache.spark.rdd.{ExecutorInProcessCoalescePartitioner, RDD} +import java.nio.{ByteBuffer, ByteOrder, FloatBuffer} +import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag -import com.intel.daal.data_management.data.{CSRNumericTable, HomogenNumericTable, RowMergedNumericTable, Matrix => DALMatrix} + +import com.intel.daal.data_management.data.CSRNumericTable import com.intel.daal.services.DaalContext + import org.apache.spark.Partitioner import org.apache.spark.internal.Logging import org.apache.spark.ml.recommendation.ALS.Rating import org.apache.spark.ml.util._ - -import java.nio.{ByteBuffer, ByteOrder} -import scala.collection.mutable.ArrayBuffer -//import java.nio.DoubleBuffer -import java.nio.FloatBuffer +import org.apache.spark.rdd.RDD class ALSDataPartitioner(blocks: Int, itemsInBlock: Long) extends Partitioner { def numPartitions: Int = blocks + def getPartition(key: Any): Int = { val k = key.asInstanceOf[Long] // itemsInBlock = numItems / partitions // remaining records will belog to the last partition // 21 => 5, 5, 5, 6 // 46 => 11, 11, 11, 13 - math.min((k / itemsInBlock).toInt, blocks-1) + math.min((k / itemsInBlock).toInt, blocks - 1) } } -class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( - data: RDD[Rating[ID]], - nFactors: Int, - maxIter: Int, - regParam: Double, - alpha: Double, - seed: Long, -) extends Serializable with Logging { +class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( data: RDD[Rating[ID]], + nFactors: Int, + maxIter: Int, + regParam: Double, + alpha: Double, + seed: Long + ) extends Serializable with Logging { // Rating struct size is size of Long+Long+Float val RATING_SIZE = 8 + 8 + 4 - // Return Map partitionId -> (ratingsNum, csrRowNum, rowOffset) - private def getRatingsPartitionInfo(data: RDD[Rating[ID]]): Map[Int, (Int, Int, Int)] = { - val collectd = data.mapPartitionsWithIndex { case (index: Int, it: Iterator[Rating[ID]]) 
=> - var ratingsNum = 0 - var s = Set[ID]() - it.foreach { v => - s += v.user - ratingsNum += 1 - } - Iterator((index, (ratingsNum, s.count(_ => true)))) - }.collect - - var ret = Map[Int, (Int, Int, Int)]() - var rowOffset = 0 - collectd.foreach { v => - val partitionId = v._1 - val ratingsNum = v._2._1 - val csrRowNum = v._2._2 - ret += ( partitionId -> (ratingsNum, csrRowNum, rowOffset)) - rowOffset = rowOffset + csrRowNum - } - - ret - } - - private def ratingsToCSRNumericTables(ratings: RDD[Rating[ID]], - nVectors: Long, nFeatures: Long, nBlocks: Long): RDD[CSRNumericTable] = { - -// val rowSortedRatings = ratings.sortBy(_.user.toString.toLong) - -// val itemsInBlock = (nFeatures + nBlocks - 1) / nBlocks - val itemsInBlock = nFeatures / nBlocks -// val rowSortedGrouped = rowSortedRatings.groupBy(value => value.user.toString.toLong / itemsInBlock).flatMap(_._2) - val rowSortedGrouped = ratings - // Transpose the dataset - .map { p => - Rating(p.item, p.user, p.rating) - } - .groupBy(value => value.user.toString.toLong) - .partitionBy(new ALSDataPartitioner(nBlocks.toInt, itemsInBlock)) - .flatMap(_._2).mapPartitions { p => - p.toArray.sortBy(_.user.toString.toLong).toIterator - } - - println("rowSortedGrouped partition number: ", rowSortedGrouped.getNumPartitions) - - // rowSortedGrouped.mapPartitionsWithIndex { case (partitionId, partition) => -// println("partitionId", partitionId) -// partition.foreach { p => -// println(p.user, p.item, p.rating) } -// Iterator(partitionId) -// }.collect() - - val ratingsPartitionInfo = getRatingsPartitionInfo(rowSortedGrouped) - println("ratingsPartitionInfo:", ratingsPartitionInfo) - - rowSortedGrouped.mapPartitionsWithIndex { case (partitionId, partition) => - val ratingsNum = ratingsPartitionInfo(partitionId)._1 - val csrRowNum = ratingsPartitionInfo(partitionId)._2 - val values = Array.fill(ratingsNum) { 0.0f } - val columnIndices = Array.fill(ratingsNum) { 0L } - val rowOffsets = ArrayBuffer[Long](1L) - - - var index = 0 - var curRow = 0L - // Each partition converted to one CSRNumericTable - partition.foreach { p => - // Modify row index for each partition (start from 0) - val row = p.user.toString.toLong - ratingsPartitionInfo(partitionId)._3 - val column = p.item.toString.toLong - val rating = p.rating - - values(index) = rating - // one-based index - columnIndices(index) = column + 1 - - if (row > curRow) { - curRow = row - // one-based index - rowOffsets += index + 1 - } - - index = index + 1 - } - // one-based row index - rowOffsets += index+1 - - println("PartitionId:", partitionId) - println("csrRowNum", csrRowNum) -// println("rowOffsets", rowOffsets.mkString(",")) -// println("columnIndices", columnIndices.mkString(",")) -// println("values", values.mkString(",")) - - val contextLocal = new DaalContext() - - println("ALSDALImpl: Loading native libraries ..." 
) - LibLoader.loadLibraries() - - val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, nVectors, csrRowNum) - val table = new CSRNumericTable(contextLocal, cTable) -// table.pack() - - println("Input dimensions:", table.getNumberOfRows, table.getNumberOfColumns) - - // There is a bug https://github.com/oneapi-src/oneDAL/pull/1288, - // printNumericTable can't print correct result for CSRNumericTable, use C++ printNumericTable - // Service.printNumericTable("Input: ", table) - - Iterator(table) - }.cache() - } - -// def factorsToRDD(cUsersFactorsNumTab: Long, cItemsFactorsNumTab: Long) -// :(RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { -// val usersFactorsNumTab = OneDAL.makeNumericTable(cUsersFactorsNumTab) -// val itemsFactorsNumTab = OneDAL.makeNumericTable(cItemsFactorsNumTab) -// -// Service.printNumericTable("usersFactorsNumTab", usersFactorsNumTab) -// Service.printNumericTable("itemsFactorsNumTab", itemsFactorsNumTab) -// -// null -// } - - def ratingsToByteBuffer(ratings: Array[Rating[ID]]): ByteBuffer = { -// println("ratings len", ratings.length) - - val buffer= ByteBuffer.allocateDirect(ratings.length*(8+8+4)) - // Use little endian - buffer.order(ByteOrder.LITTLE_ENDIAN) - ratings.foreach { rating => - buffer.putLong(rating.user.toString.toLong) - buffer.putLong(rating.item.toString.toLong) - buffer.putFloat(rating.rating) - } - buffer - } - - def run(): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { + def train(): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { val executorNum = Utils.sparkExecutorNum(data.sparkContext) val executorCores = Utils.sparkExecutorCores() @@ -193,35 +69,32 @@ class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( Ordering[Long].compare(x.user.toString.toLong, y.user.toString.toLong) }).user.toString.toLong + 1 -// val largestItems = data.sortBy(_.item.toString.toLong, ascending = false).take(1) -// val nFeatures = largestItems(0).item.toString.toLong + 1 - -// val largestUsers = data.sortBy(_.user.toString.toLong, ascending = false).take(1) -// val nVectors = largestUsers(0).user.toString.toLong + 1 - val nBlocks = executorNum -// val nRatings = data.count() - - logInfo(s"ALSDAL fit using $executorNum Executors for $nVectors vectors and $nFeatures features") + logInfo(s"ALSDAL fit using $executorNum Executors " + + s"for $nVectors vectors and $nFeatures features") - val numericTables = data.repartition(executorNum).setName("Repartitioned for conversion").cache() + val numericTables = data.repartition(executorNum) + .setName("Repartitioned for conversion").cache() val executorIPAddress = Utils.sparkFirstExecutorIP(numericTables.sparkContext) - val kvsIP = numericTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress) + val kvsIP = numericTables.sparkContext.conf.get( + "spark.oap.mllib.oneccl.kvs.ip", executorIPAddress) val kvsPortDetected = Utils.checkExecutorAvailPort(numericTables, kvsIP) - val kvsPort = numericTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected) + val kvsPort = numericTables.sparkContext.conf.getInt( + "spark.oap.mllib.oneccl.kvs.port", kvsPortDetected) - val kvsIPPort = kvsIP+"_"+kvsPort + val kvsIPPort = kvsIP + "_" + kvsPort val results = numericTables // Transpose the dataset .map { p => - Rating(p.item, p.user, p.rating) } + Rating(p.item, p.user, p.rating) + } .mapPartitionsWithIndex { (rank, iter) => val context = new DaalContext() - println("ALSDALImpl: Loading libMLlibDAL.so" ) + println("ALSDALImpl: Loading 
libMLlibDAL.so") LibLoader.loadLibraries() OneCCL.init(executorNum, rank, kvsIPPort) @@ -233,7 +106,8 @@ class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( val bufferInfo = new ALSPartitionInfo val shuffledBuffer = cShuffleData(buffer, nFeatures.toInt, nBlocks, bufferInfo) - val table = bufferToCSRNumericTable(shuffledBuffer, bufferInfo, nVectors.toInt, nFeatures.toInt, nBlocks, rankId) + val table = bufferToCSRNumericTable(shuffledBuffer, bufferInfo, + nVectors.toInt, nFeatures.toInt, nBlocks, rankId) val result = new ALSResult() cDALImplictALS( @@ -245,87 +119,82 @@ class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( result ) Iterator(result) - }.cache() - -// results.foreach { p => -//// val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) -//// println("foreach", p.cUsersFactorsNumTab, p.cItemsFactorsNumTab) -// println("result", p.rankId, p.cUserOffset, p.cItemOffset); -// } - -// val usersFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => -// partiton.foreach { p => -// val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) -// Service.printNumericTable("usersFactorsNumTab", usersFactorsNumTab) -// } -// Iterator() -// }.collect() - - val usersFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => - val ret = partiton.flatMap { p => - val userOffset = p.cUserOffset.toInt - val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) - val nRows = usersFactorsNumTab.getNumberOfRows.toInt - val nCols = usersFactorsNumTab.getNumberOfColumns.toInt - var buffer = FloatBuffer.allocate(nCols * nRows) - // should use returned buffer - buffer = usersFactorsNumTab.getBlockOfRows(0, nRows, buffer) - (0 until nRows).map { index => - val array = Array.fill(nCols){0.0f} - buffer.get(array, 0, nCols) - ((index+userOffset).asInstanceOf[ID], array) - }.toIterator - } - ret - }.setName("userFactors").cache() - - val itemsFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => - val ret = partiton.flatMap { p => - val itemOffset = p.cItemOffset.toInt - val itemsFactorsNumTab = OneDAL.makeNumericTable(p.cItemsFactorsNumTab) - val nRows = itemsFactorsNumTab.getNumberOfRows.toInt - val nCols = itemsFactorsNumTab.getNumberOfColumns.toInt - var buffer = FloatBuffer.allocate(nCols * nRows) - // should use returned buffer - buffer = itemsFactorsNumTab.getBlockOfRows(0, nRows, buffer) - (0 until nRows).map { index => - val array = Array.fill(nCols){0.0f} - buffer.get(array, 0, nCols) - ((index+itemOffset).asInstanceOf[ID], array) - }.toIterator - } - ret - }.setName("itemFactors").cache() + }.cache() + + val usersFactorsRDD = results + .mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => + val ret = partiton.flatMap { p => + val userOffset = p.cUserOffset.toInt + val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) + val nRows = usersFactorsNumTab.getNumberOfRows.toInt + val nCols = usersFactorsNumTab.getNumberOfColumns.toInt + var buffer = FloatBuffer.allocate(nCols * nRows) + // should use returned buffer + buffer = usersFactorsNumTab.getBlockOfRows(0, nRows, buffer) + (0 until nRows).map { index => + val array = Array.fill(nCols) { + 0.0f + } + buffer.get(array, 0, nCols) + ((index + userOffset).asInstanceOf[ID], array) + }.toIterator + } + ret + }.setName("userFactors").cache() + + val itemsFactorsRDD = results + .mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => + val ret = 
partiton.flatMap { p => + val itemOffset = p.cItemOffset.toInt + val itemsFactorsNumTab = OneDAL.makeNumericTable(p.cItemsFactorsNumTab) + val nRows = itemsFactorsNumTab.getNumberOfRows.toInt + val nCols = itemsFactorsNumTab.getNumberOfColumns.toInt + var buffer = FloatBuffer.allocate(nCols * nRows) + // should use returned buffer + buffer = itemsFactorsNumTab.getBlockOfRows(0, nRows, buffer) + (0 until nRows).map { index => + val array = Array.fill(nCols) { + 0.0f + } + buffer.get(array, 0, nCols) + ((index + itemOffset).asInstanceOf[ID], array) + }.toIterator + } + ret + }.setName("itemFactors").cache() usersFactorsRDD.count() itemsFactorsRDD.count() -// usersFactorsRDD.foreach { case (id, array) => -// println("usersFactorsRDD", id, array.mkString(", ")) -// } -// -// itemsFactorsRDD.foreach { case (id, array) => -// println("itemsFactorsRDD", id, array.mkString(", ")) -// } - (usersFactorsRDD, itemsFactorsRDD) } - private def getPartitionOffset(partitionId: Int, nRatings: Int, nBlocks: Int): Int = { - require(partitionId >=0 && partitionId < nBlocks) - val itemsInBlock = nRatings / nBlocks - return partitionId * itemsInBlock + def ratingsToByteBuffer(ratings: Array[Rating[ID]]): ByteBuffer = { + val buffer = ByteBuffer.allocateDirect(ratings.length * (8 + 8 + 4)) + // Use little endian + buffer.order(ByteOrder.LITTLE_ENDIAN) + ratings.foreach { rating => + buffer.putLong(rating.user.toString.toLong) + buffer.putLong(rating.item.toString.toLong) + buffer.putFloat(rating.rating) + } + buffer } private def bufferToCSRNumericTable(buffer: ByteBuffer, info: ALSPartitionInfo, - nVectors: Int, nFeatures: Int, nBlocks: Int, rankId: Int): CSRNumericTable = { + nVectors: Int, nFeatures: Int, + nBlocks: Int, rankId: Int): CSRNumericTable = { // Use little endian buffer.order(ByteOrder.LITTLE_ENDIAN) val ratingsNum = info.ratingsNum val csrRowNum = info.csrRowNum - val values = Array.fill(ratingsNum) { 0.0f } - val columnIndices = Array.fill(ratingsNum) { 0L } + val values = Array.fill(ratingsNum) { + 0.0f + } + val columnIndices = Array.fill(ratingsNum) { + 0L + } val rowOffsets = ArrayBuffer[Long](1L) var index = 0 @@ -333,9 +202,9 @@ class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( // Each partition converted to one CSRNumericTable for (i <- 0 until ratingsNum) { // Modify row index for each partition (start from 0) - val row = buffer.getLong(i*RATING_SIZE) - getPartitionOffset(rankId, nFeatures, nBlocks) - val column = buffer.getLong(i*RATING_SIZE+8) - val rating = buffer.getFloat(i*RATING_SIZE+16) + val row = buffer.getLong(i * RATING_SIZE) - getPartitionOffset(rankId, nFeatures, nBlocks) + val column = buffer.getLong(i * RATING_SIZE + 8) + val rating = buffer.getFloat(i * RATING_SIZE + 16) values(index) = rating // one-based index @@ -350,27 +219,49 @@ class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( index = index + 1 } // one-based row index - rowOffsets += index+1 - -// println("rankId:", rankId) -// println("csrRowNum", csrRowNum) - -// println(rowOffsets.mkString(" ")) -// println(columnIndices.mkString(" ")) -// println(values.mkString(" ")) + rowOffsets += index + 1 val contextLocal = new DaalContext() - val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, nVectors, csrRowNum) + val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, + nVectors, csrRowNum) val table = new CSRNumericTable(contextLocal, cTable) - println("Input dimensions:", table.getNumberOfRows, table.getNumberOfColumns) -// 
Service.printNumericTable("Input NumericTable", table) - table } + private def getPartitionOffset(partitionId: Int, nRatings: Int, nBlocks: Int): Int = { + require(partitionId >= 0 && partitionId < nBlocks) + val itemsInBlock = nRatings / nBlocks + return partitionId * itemsInBlock + } + + // Return Map partitionId -> (ratingsNum, csrRowNum, rowOffset) + private def getRatingsPartitionInfo(data: RDD[Rating[ID]]): Map[Int, (Int, Int, Int)] = { + val collectd = data.mapPartitionsWithIndex { case (index: Int, it: Iterator[Rating[ID]]) => + var ratingsNum = 0 + var s = Set[ID]() + it.foreach { v => + s += v.user + ratingsNum += 1 + } + Iterator((index, (ratingsNum, s.count(_ => true)))) + }.collect + + var ret = Map[Int, (Int, Int, Int)]() + var rowOffset = 0 + collectd.foreach { v => + val partitionId = v._1 + val ratingsNum = v._2._1 + val csrRowNum = v._2._2 + ret += (partitionId -> (ratingsNum, csrRowNum, rowOffset)) + rowOffset = rowOffset + csrRowNum + } + + ret + } + // Single entry to call Implict ALS DAL backend - @native private def cDALImplictALS(data: Long, + @native private def cDALImplictALS(data: Long, nUsers: Long, nFactors: Int, maxIter: Int, @@ -380,6 +271,7 @@ class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( executor_cores: Int, rankId: Int, result: ALSResult): Long + @native private def cShuffleData(data: ByteBuffer, nTotalKeys: Int, nBlocks: Int, diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala index 7581a1003..7ea7cb694 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala @@ -1,12 +1,11 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Copyright 2020 Intel Corporation * - * http://www.apache.org/licenses/LICENSE-2.0 + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -25,12 +24,12 @@ object OneCCL extends Logging { // Run on Executor def setExecutorEnv(): Unit = { - setEnv("CCL_ATL_TRANSPORT","ofi") + setEnv("CCL_ATL_TRANSPORT", "ofi") // Uncomment this if you whant to debug oneCCL // setEnv("CCL_LOG_LEVEL", "2") } - def init(executor_num: Int, rank: Int, ip_port: String) = { + def init(executor_num: Int, rank: Int, ip_port: String): Unit = { setExecutorEnv() @@ -42,7 +41,8 @@ object OneCCL extends Logging { // executor number should equal to oneCCL world size assert(executor_num == cclParam.commSize, "executor number should equal to oneCCL world size") - logInfo(s"Initialized with executorNum: $executor_num, commSize, ${cclParam.commSize}, rankId: ${cclParam.rankId}") + logInfo(s"Initialized with executorNum: $executor_num, " + + s"commSize, ${cclParam.commSize}, rankId: ${cclParam.rankId}") } // Run on Executor @@ -62,4 +62,4 @@ object OneCCL extends Logging { @native def setEnv(key: String, value: String, overwrite: Boolean = true): Int @native def c_getAvailPort(localIP: String): Int -} \ No newline at end of file +} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala index 9b6c0f6c7..62a803dc3 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala @@ -1,12 +1,11 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Copyright 2020 Intel Corporation * - * http://www.apache.org/licenses/LICENSE-2.0 + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,14 +16,17 @@ package org.apache.spark.ml.util -import java.nio.DoubleBuffer +import java.util.logging.{Level, Logger} -import com.intel.daal.data_management.data.{HomogenNumericTable, NumericTable, RowMergedNumericTable, Matrix => DALMatrix} +import com.intel.daal.data_management.data.{HomogenNumericTable, Matrix => DALMatrix, NumericTable, + RowMergedNumericTable} import com.intel.daal.services.DaalContext -import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Vector, Vectors} + +import org.apache.spark.SparkContext +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.{Vector => OldVector} import org.apache.spark.rdd.{ExecutorInProcessCoalescePartitioner, RDD} -import java.util.logging.{Logger, Level} +import org.apache.spark.storage.StorageLevel object OneDAL { @@ -49,7 +51,7 @@ object OneDAL { resArray } - def makeNumericTable (cData: Long) : NumericTable = { + def makeNumericTable(cData: Long): NumericTable = { val context = new DaalContext() val table = new HomogenNumericTable(context, cData) @@ -57,7 +59,7 @@ object OneDAL { table } - def makeNumericTable (arrayVectors: Array[OldVector]): NumericTable = { + def makeNumericTable(arrayVectors: Array[OldVector]): NumericTable = { val numCols = arrayVectors.head.size val numRows: Int = arrayVectors.size @@ -68,42 +70,64 @@ object OneDAL { arrayVectors.zipWithIndex.foreach { case (v, rowIndex) => - for (colIndex <- 0 until numCols) - // matrix.set(rowIndex, colIndex, row.getString(colIndex).toDouble) + for (colIndex <- 0 until numCols) { setNumericTableValue(matrix.getCNumericTable, rowIndex, colIndex, v(colIndex)) + } } matrix } - def rddVectorToNumericTables(vectors: RDD[Vector], executorNum: Int): RDD[Long] = { - // repartition to executorNum if not enough partitions + def releaseNumericTables(sparkContext: SparkContext): Unit = { + sparkContext.getPersistentRDDs + .filter(r => r._2.name == "numericTables") + .foreach { rdd => + val numericTables = rdd._2.asInstanceOf[RDD[Long]] + numericTables.foreach { address => + OneDAL.cFreeDataMemory(address) + } + } + } + + def vectorsToMergedNumericTables(vectors: RDD[Vector], executorNum: Int): RDD[Long] = { + require(executorNum > 0) + + logger.info(s"Processing partitions with $executorNum executors") + + // Repartition to executorNum if not enough partitions val dataForConversion = if (vectors.getNumPartitions < executorNum) { vectors.repartition(executorNum).setName("Repartitioned for conversion").cache() } else { vectors } + // Get dimensions for each partition val partitionDims = Utils.getPartitionDims(dataForConversion) - // filter out empty partitions - val nonEmptyPartitions = dataForConversion.mapPartitionsWithIndex { (index: Int, it: Iterator[Vector]) => - Iterator(Tuple3(partitionDims(index)._1, index, it)) - }.filter { entry => { entry._1 > 0 }} + // Filter out empty partitions + val nonEmptyPartitions = dataForConversion.mapPartitionsWithIndex { + (index: Int, it: Iterator[Vector]) => Iterator(Tuple3(partitionDims(index)._1, index, it)) + }.filter { entry => { + entry._1 > 0 + } + } + // Convert to RDD[HomogenNumericTable] val numericTables = nonEmptyPartitions.map { entry => val numRows = entry._1 val index = entry._2 val it = entry._3 val numCols = partitionDims(index)._2 + logger.info(s"Partition 
index: $index, numCols: $numCols, numRows: $numRows") + // Build DALMatrix, this will load libJavaAPI, libtbb, libtbbmalloc val context = new DaalContext() val matrix = new DALMatrix(context, classOf[java.lang.Double], numCols.toLong, numRows.toLong, NumericTable.AllocationFlag.DoAllocate) // oneDAL libs should be loaded by now, loading other native libs - logger.log(logLevel, "IntelMLlib: Loading other native libraries ...") + logger.info("Loading native libraries") LibLoader.loadLibraries() var dalRow = 0 @@ -115,16 +139,17 @@ object OneDAL { } matrix.getCNumericTable - }.cache() + }.setName("numericTables").cache() - // workaroud to fix the bug of multi executors handling same partition. - numericTables.foreachPartition(() => _) numericTables.count() - val cachedRdds = vectors.sparkContext.getPersistentRDDs - cachedRdds.filter(r => r._2.name=="instancesRDD").foreach (r => r._2.unpersist()) + // Unpersist instances RDD + if (vectors.getStorageLevel != StorageLevel.NONE) { + vectors.unpersist() + } - val coalescedRdd = numericTables.coalesce(1, + // Coalesce partitions belonging to the same executor + val coalescedRdd = numericTables.coalesce(executorNum, partitionCoalescer = Some(new ExecutorInProcessCoalescePartitioner())) val coalescedTables = coalescedRdd.mapPartitions { iter => @@ -144,12 +169,14 @@ object OneDAL { @native def cAddNumericTable(cObject: Long, numericTableAddr: Long) - @native def cSetDoubleBatch(numTableAddr: Long, curRows: Int, batch: Array[Double], numRows: Int, numCols: Int) - + @native def cSetDoubleBatch(numTableAddr: Long, curRows: Int, batch: Array[Double], + numRows: Int, numCols: Int) + @native def cFreeDataMemory(numTableAddr: Long) - @native def cCheckPlatformCompatibility() : Boolean + @native def cCheckPlatformCompatibility(): Boolean - @native def cNewCSRNumericTable(data: Array[Float], colIndices: Array[Long], rowOffsets: Array[Long], nFeatures: Long, - nVectors: Long) : Long + @native def cNewCSRNumericTable(data: Array[Float], + colIndices: Array[Long], rowOffsets: Array[Long], + nFeatures: Long, nVectors: Long): Long } diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala index 0dd43d24f..525afc78b 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala @@ -1,10 +1,26 @@ +/* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.ml.util import java.net.InetAddress import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.rdd.RDD import org.apache.spark.ml.linalg.Vector +import org.apache.spark.rdd.RDD object Utils { @@ -32,21 +48,6 @@ object Utils { ret } - def sparkExecutorNum(sc: SparkContext): Int = { - - if (sc.master.contains("local")) - return 1 - - // Create empty partitions to start executors - sc.parallelize(Seq[Int]()).count() - - // Get running executors infos - val executorInfos = sc.statusTracker.getExecutorInfos - - // Return executor number (exclude driver) - executorInfos.length - 1 - } - def sparkExecutorCores(): Int = { val conf = new SparkConf(true) @@ -63,18 +64,20 @@ object Utils { val info = sc.statusTracker.getExecutorInfos // get first executor, info(0) is driver - val host = if (sc.master.startsWith("local")) + val host = if (sc.master.startsWith("local")) { info(0).host() - else + } else { info(1).host() + } val ip = InetAddress.getByName(host).getHostAddress ip } - def checkExecutorAvailPort(data: RDD[_], localIP: String) : Int = { + def checkExecutorAvailPort(data: RDD[_], localIP: String): Int = { if (localIP == "127.0.0.1" || localIP == "127.0.1.1") { - println(s"\nOneCCL: Error: doesn't support loopback IP ${localIP}, please assign IP address to your host.\n") + println(s"\nOneCCL: Error: doesn't support loopback IP ${localIP}, " + + s"please assign IP address to your host.\n") System.exit(-1) } @@ -82,21 +85,23 @@ object Utils { val result = data.mapPartitions { p => LibLoader.loadLibraries() val port = OneCCL.getAvailPort(localIP) - if (port != -1) + if (port != -1) { Iterator(port) - else + } else { Iterator() + } }.collect() - return result(0) + result(0) } - def checkClusterPlatformCompatibility(sc: SparkContext) : Boolean = { + def checkClusterPlatformCompatibility(sc: SparkContext): Boolean = { LibLoader.loadLibraries() // check driver platform compatibility - if (!OneDAL.cCheckPlatformCompatibility()) + if (!OneDAL.cCheckPlatformCompatibility()) { return false + } // check workers' platform compatibility val executor_num = Utils.sparkExecutorNum(sc) @@ -106,6 +111,22 @@ object Utils { OneDAL.cCheckPlatformCompatibility() }.collect() - return result.forall( _ == true) + result.forall(_ == true) + } + + def sparkExecutorNum(sc: SparkContext): Int = { + + if (sc.master.contains("local")) { + return 1 + } + + // Create empty partitions to start executors + sc.parallelize(Seq[Int]()).count() + + // Get running executors infos + val executorInfos = sc.statusTracker.getExecutorInfos + + // Return executor number (exclude driver) + executorInfos.length - 1 } } diff --git a/mllib-dal/src/main/scala/org/apache/spark/rdd/ExecutorInProcessCoalescePartitioner.scala b/mllib-dal/src/main/scala/org/apache/spark/rdd/ExecutorInProcessCoalescePartitioner.scala index 6a19990e8..12f045a79 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/rdd/ExecutorInProcessCoalescePartitioner.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/rdd/ExecutorInProcessCoalescePartitioner.scala @@ -1,12 +1,11 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Copyright 2020 Intel Corporation * - * http://www.apache.org/licenses/LICENSE-2.0 + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,35 +16,28 @@ package org.apache.spark.rdd -import org.apache.commons.logging.LogFactory - -import org.apache.spark.Partition -import org.apache.spark.SparkException -import org.apache.spark.scheduler.ExecutorCacheTaskLocation -import org.apache.spark.scheduler.TaskLocation - import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +import org.apache.spark.{Partition, SparkException} +import org.apache.spark.scheduler.{ExecutorCacheTaskLocation, TaskLocation} + class ExecutorInProcessCoalescePartitioner extends PartitionCoalescer with Serializable { def coalesce(maxPartitions: Int, prev: RDD[_]): Array[PartitionGroup] = { val map = new mutable.HashMap[String, mutable.HashSet[Partition]]() - val groupArr = ArrayBuffer[PartitionGroup]() prev.partitions.foreach(p => { val loc = prev.context.getPreferredLocs(prev, p.index) - loc.foreach{ - case location : ExecutorCacheTaskLocation => - - val execLoc = "executor_" + location.host + "_" + location.executorId - val partValue = map.getOrElse(execLoc, new mutable.HashSet[Partition]()) - partValue.add(p) - map.put(execLoc, partValue) - case loc : TaskLocation => - throw new SparkException("Invalid location !!!") - + loc.foreach { + case location : ExecutorCacheTaskLocation => + val execLoc = "executor_" + location.host + "_" + location.executorId + val partValue = map.getOrElse(execLoc, new mutable.HashSet[Partition]()) + partValue.add(p) + map.put(execLoc, partValue) + case _ : TaskLocation => + throw new SparkException("ExecutorInProcessCoalescePartitioner: Invalid task location!") } }) map.foreach(x => { @@ -54,12 +46,13 @@ class ExecutorInProcessCoalescePartitioner list.foreach(part => pg.partitions += part) groupArr += pg }) - if (groupArr.length == 0) throw new SparkException("No partitions or" + - " no locations for partitions found.") + if (groupArr.length == 0) { + throw new SparkException( + "ExecutorInProcessCoalescePartitioner: No partitions or no locations for partitions found.") + } val sortedGroupArr = groupArr.sortWith(_.partitions(0).index < _.partitions(0).index) - return sortedGroupArr.toArray + sortedGroupArr.toArray } } - diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh index 63ae4eccb..d7df508fe 100755 --- a/mllib-dal/test.sh +++ b/mllib-dal/test.sh @@ -27,7 +27,7 @@ if [[ -z $CCL_ROOT ]]; then fi if [[ -z $1 ]]; then - echo SPARK_VER not defined, using default (3.0.0). + echo SPARK_VER not defined, using default version spark-3.0.0. 
else SPARK_VER=$1 fi @@ -42,13 +42,8 @@ echo Clang Version: $(clang -dumpversion) echo SPARK_VER=$SPARK_VER echo ============================= -# Enable signal chaining support for JNI -# export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so - -# -Dtest=none to turn off the Java tests - -# Test all -# mvn -Dtest=none -Dmaven.test.skip=false test +# Clean +mvn clean # Individual test if [[ -z $SPARK_VER ]]; then @@ -56,7 +51,7 @@ if [[ -z $SPARK_VER ]]; then mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test # mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test else - mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test -P$SPARK_VER - mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test -P$SPARK_VER -# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test -P$SPARK_VER + mvn -P$SPARK_VER -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test + mvn -P$SPARK_VER -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test +# mvn -P$SPARK_VER -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test fi