diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
index 9f6ac9dd21164..4998a96d8e5f0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -629,7 +629,16 @@ def generate_dataset_profile(  # noqa: C901 (complexity)
         self.query_combiner.flush()

         assert profile.rowCount is not None
-        row_count: int = profile.rowCount
+        row_count: int  # used for null counts calculation
+        if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
+            # We could alternatively use `self._get_dataset_rows(profile)` to get
+            # the exact row count of the sample, since the actual number of sampled
+            # rows may differ slightly (more or less) from the configured `sample_size`.
+            # We don't do so to start with, as that adds another query's overhead, and
+            # approximate metrics should be acceptable for sampling-based profiling.
+            row_count = self.config.sample_size
+        else:
+            row_count = profile.rowCount

         for column_spec in columns_profiling_queue:
             column = column_spec.column
@@ -781,7 +790,7 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non
         sample_pc = 100 * self.config.sample_size / profile.rowCount
         sql = (
             f"SELECT * FROM {str(self.dataset._table)} "
-            + f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
+            + f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)"
         )
         temp_table_name = create_bigquery_temp_table(
             self,
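
Note on the second hunk: with only three decimal places, the computed sample percentage rounds down to `0.000` whenever `sample_size / rowCount < 5e-6` (e.g. a sample of 1,000 rows from a table of more than ~200 million rows), so BigQuery's `TABLESAMPLE SYSTEM` clause would sample nothing. The sketch below uses hypothetical numbers, not values from the patch, to show the effect of the precision bump:

```python
# Minimal sketch (hypothetical numbers) showing why three decimal places
# are not enough for very large tables.
sample_size = 1_000          # assumed configured sample size
row_count = 50_000_000_000   # assumed table row count (50B rows)

sample_pc = 100 * sample_size / row_count  # = 0.000002
print(f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)")
# -> TABLESAMPLE SYSTEM (0.000 percent)   (underflows: empty sample)
print(f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)")
# -> TABLESAMPLE SYSTEM (0.00000200 percent)
```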