Skip to content

Commit

Permalink
[DCS-204] add spark integration support
Browse files Browse the repository at this point in the history
  • Loading branch information
Subhankar authored and Subhankar committed Jul 10, 2024
1 parent 2ec343d commit c29295c
Show file tree
Hide file tree
Showing 41 changed files with 2,766 additions and 1,640 deletions.
2 changes: 1 addition & 1 deletion datachecks/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from datachecks.__version__ import __version__
from datachecks.core import Configuration, Inspect
from datachecks.core.configuration.configuration_parser_v1 import load_configuration
from datachecks.core.configuration.configuration_parser import load_configuration

# from datachecks.core.common.models.metric import DataSourceMetrics
from datachecks.core.inspect import InspectOutput
Expand Down
24 changes: 20 additions & 4 deletions datachecks/core/common/models/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from dataclasses import dataclass
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

from datachecks.core.common.models.data_source_resource import Field, Index, Table
from datachecks.core.common.models.metric import MetricsType
Expand All @@ -36,6 +36,7 @@ class DataSourceType(str, Enum):
REDSHIFT = "redshift"
SNOWFLAKE = "snowflake"
DATABRICKS = "databricks"
SPARK_DF = "spark_df"


class DataSourceLanguageSupport(str, Enum):
Expand Down Expand Up @@ -70,6 +71,8 @@ class DataSourceConnectionConfiguration:

driver: Optional[str] = None # SQL Server specific configuration

spark_session: Optional[Any] = None # Spark specific configuration


@dataclass
class DataSourceConfiguration:
Expand Down Expand Up @@ -223,7 +226,20 @@ class Configuration:
Configuration for the data checks
"""

data_sources: Optional[Dict[str, DataSourceConfiguration]] = None
validations: Optional[Dict[str, ValidationConfigByDataset]] = None
data_sources: Optional[Dict[str, DataSourceConfiguration]] = field(
default_factory=dict
)
validations: Optional[Dict[str, ValidationConfigByDataset]] = field(
default_factory=dict
)
metrics: Optional[Dict[str, MetricConfiguration]] = None
storage: Optional[MetricStorageConfiguration] = None

def add_spark_session(self, data_source_name: str, spark_session) -> None:
    """Register a Spark-dataframe data source on this configuration.

    Wraps *spark_session* in a ``DataSourceConnectionConfiguration`` and
    stores a ``SPARK_DF``-typed ``DataSourceConfiguration`` under
    *data_source_name* in ``self.data_sources``, overwriting any existing
    entry with the same name.

    Args:
        data_source_name: Key under which the data source is registered.
        spark_session: Active Spark session object (type not enforced here;
            presumably a ``pyspark.sql.SparkSession`` — confirm at call site).
    """
    # Build the connection wrapper first, then register the fully-formed
    # data-source configuration in one assignment.
    connection = DataSourceConnectionConfiguration(spark_session=spark_session)
    self.data_sources[data_source_name] = DataSourceConfiguration(
        name=data_source_name,
        type=DataSourceType.SPARK_DF,
        connection_config=connection,
    )
Loading

0 comments on commit c29295c

Please sign in to comment.