From 0ce4d1e9427fe7cb972cd278a2e18c5c7a4ff7d2 Mon Sep 17 00:00:00 2001
From: Mayuri N
Date: Fri, 27 Oct 2023 12:46:16 +0530
Subject: [PATCH 1/2] fix(ingest/bigquery): use correct row count in null count
 profiling computation

---
 .../src/datahub/ingestion/source/ge_data_profiler.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
index 9f6ac9dd21164..46cc4c8b26c31 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -781,7 +781,7 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non
         sample_pc = 100 * self.config.sample_size / profile.rowCount
         sql = (
             f"SELECT * FROM {str(self.dataset._table)} "
-            + f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
+            + f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)"
         )
         temp_table_name = create_bigquery_temp_table(
             self,
@@ -793,6 +793,13 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non
         self.dataset._table = sa.text(temp_table_name)
         logger.debug(f"Setting table name to be {self.dataset._table}")

+        # We could alternatively use `self._get_dataset_rows(profile)` to get the
+        # exact count of rows in the sample, as the actual number of sampled rows
+        # may differ slightly (more or less) from the configured `sample_size`.
+        # We don't do that for now, as it would add another query's overhead, and
+        # approximate metrics are acceptable for sampling-based profiling.
+        profile.rowCount = self.config.sample_size
+
         if (
             profile.partitionSpec
             and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE
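A note on the precision change in the patch above: with only three decimal places, the computed sampling percentage rounds to 0.000 for sufficiently large tables, and the generated TABLESAMPLE clause then selects essentially no rows. A minimal Python sketch of the arithmetic (the table size and sample size below are illustrative, not taken from the patch):

    sample_size = 1_000            # stand-in for self.config.sample_size
    row_count = 50_000_000_000     # hypothetical large BigQuery table
    sample_pc = 100 * sample_size / row_count  # 2e-06, i.e. 0.000002 percent

    print(f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)")
    # -> TABLESAMPLE SYSTEM (0.000 percent)       rounds down to nothing
    print(f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)")
    # -> TABLESAMPLE SYSTEM (0.00000200 percent)

The patch above also overwrites profile.rowCount with the configured sample size, which would misreport the table's true row count in the emitted profile; the follow-up patch below narrows that assignment to the null count computation only.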
From f3cd6d4429c2d00c511b7c3c66f74fe6c7f2b071 Mon Sep 17 00:00:00 2001
From: Mayuri N
Date: Mon, 30 Oct 2023 11:31:43 +0530
Subject: [PATCH 2/2] update only the local row count, not profile.rowCount

---
 .../ingestion/source/ge_data_profiler.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
index 46cc4c8b26c31..4998a96d8e5f0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -629,7 +629,16 @@ def generate_dataset_profile(  # noqa: C901 (complexity)
         self.query_combiner.flush()

         assert profile.rowCount is not None
-        row_count: int = profile.rowCount
+        row_count: int  # used for null count computation
+        if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
+            # We could alternatively use `self._get_dataset_rows(profile)` to get the
+            # exact count of rows in the sample, as the actual number of sampled rows
+            # may differ slightly (more or less) from the configured `sample_size`.
+            # We don't do that for now, as it would add another query's overhead, and
+            # approximate metrics are acceptable for sampling-based profiling.
+            row_count = self.config.sample_size
+        else:
+            row_count = profile.rowCount

         for column_spec in columns_profiling_queue:
             column = column_spec.column
@@ -793,13 +802,6 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non
         self.dataset._table = sa.text(temp_table_name)
         logger.debug(f"Setting table name to be {self.dataset._table}")

-        # We could alternatively use `self._get_dataset_rows(profile)` to get the
-        # exact count of rows in the sample, as the actual number of sampled rows
-        # may differ slightly (more or less) from the configured `sample_size`.
-        # We don't do that for now, as it would add another query's overhead, and
-        # approximate metrics are acceptable for sampling-based profiling.
-        profile.rowCount = self.config.sample_size
-
         if (
             profile.partitionSpec
             and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE
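Why the local row count matters: per-column null counts are computed against the sampled temp table, so dividing them by the full table's profile.rowCount would drastically understate null proportions, while overwriting profile.rowCount (as the first patch did) would misreport the table's size. A rough sketch of the effect, with hypothetical numbers (none of these values come from the patch):

    table_rows = 10_000_000   # profile.rowCount: size of the full table
    sample_rows = 1_000       # self.config.sample_size: rows actually profiled
    null_count = 250          # nulls observed in one column of the sample

    # Dividing sample-derived nulls by the full table size (the original bug):
    print(null_count / table_rows)    # 0.000025 -- wildly understated

    # Dividing by the local row_count introduced in the second patch:
    print(null_count / sample_rows)   # 0.25

Keeping the sample size in a local row_count preserves the real table size on the emitted profile while the null proportions still come out right; the remaining error is only the small gap between the configured sample_size and the number of rows TABLESAMPLE actually returns, which the in-code comment deliberately accepts as approximate.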