Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add lake formation support in catalog #810

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 207 additions & 2 deletions framework/API.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

import * as cdk from 'aws-cdk-lib';
import { Bucket } from 'aws-cdk-lib/aws-s3';
import { Construct } from 'constructs';
import * as dsf from '../../index';

/// !show
/**
 * Example stack that declares a `DataCatalogDatabase` backed by an S3 bucket
 * and configured with the Lake Formation permission model.
 */
class ExampleDefaultDataCatalogDatabaseStack extends cdk.Stack {
  constructor(scope: Construct, id: string) {
    super(scope, id);

    // Bucket holding the data referenced by the catalog database.
    const catalogBucket = new Bucket(this, 'DataCatalogBucket');

    new dsf.governance.DataCatalogDatabase(this, 'DataCatalogDatabase', {
      name: 'example-db',
      locationBucket: catalogBucket,
      locationPrefix: '/databasePath',
      // Opt in to Lake Formation instead of the default IAM permission model.
      permissionModel: dsf.utils.PermissionModel.LAKE_FORMATION,
    });
  }
}
/// !hide

// Instantiate the example stack inside a new CDK application.
const exampleApp = new cdk.App();
new ExampleDefaultDataCatalogDatabaseStack(exampleApp, 'ExampleDefaultDataCatalogDatabaseStack');
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

import * as cdk from 'aws-cdk-lib';
import { Construct } from 'constructs';
import * as dsf from '../../index';

/// !show
/**
 * Example stack that declares a `DataLakeCatalog` on top of a `DataLakeStorage`
 * and configured with the Lake Formation permission model.
 */
class ExampleDefaultDataLakeCatalogStack extends cdk.Stack {
  constructor(scope: Construct, id: string) {
    super(scope, id);

    // Storage construct that the catalog is attached to.
    const dataLakeStorage = new dsf.storage.DataLakeStorage(this, 'MyDataLakeStorage');

    new dsf.governance.DataLakeCatalog(this, 'DataCatalog', {
      dataLakeStorage,
      // Opt in to Lake Formation instead of the default IAM permission model.
      permissionModel: dsf.utils.PermissionModel.LAKE_FORMATION,
    });
  }
}
/// !hide

// Instantiate the example stack inside a new CDK application.
const exampleApp = new cdk.App();
new ExampleDefaultDataLakeCatalogStack(exampleApp, 'ExampleDefaultDataLakeCatalogStack');
24 changes: 22 additions & 2 deletions framework/src/governance/lib/data-catalog-database-props.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { IRole } from 'aws-cdk-lib/aws-iam';
import { IKey } from 'aws-cdk-lib/aws-kms';
import { IBucket } from 'aws-cdk-lib/aws-s3';
import { ISecret } from 'aws-cdk-lib/aws-secretsmanager';
import { PermissionModel } from '../../utils';

/**
* Properties for the `DataCatalogDatabase` construct
Expand All @@ -24,8 +25,7 @@ export interface DataCatalogDatabaseProps {

/**
* Top level location where table data is stored.
* The location prefix cannot be empty if the `locationBucket` is set.
* The minimal configuration is `/` for the root level in the Bucket.
* @default - the root of the bucket is used as the location prefix.
*/
readonly locationPrefix?: string;

Expand Down Expand Up @@ -87,4 +87,24 @@ export interface DataCatalogDatabaseProps {
* @default - The resources are not deleted (`RemovalPolicy.RETAIN`).
*/
readonly removalPolicy?: RemovalPolicy;

/**
* The permission model to apply to the Glue Database.
* @default - IAM permission model is used
*/
readonly permissionModel?: PermissionModel;

/**
* The IAM Role used by Lake Formation for data access.
* Only needed when `permissionModel` is set to `PermissionModel.LAKE_FORMATION` or `PermissionModel.HYBRID`.
* @default - A new role is created
*/
readonly lakeFormationDataAccessRole?: IRole;

/**
* The IAM Role used to perform Lake Formation configuration.
* Only needed when `permissionModel` is set to `PermissionModel.LAKE_FORMATION` or `PermissionModel.HYBRID`.
* @default - A new role is created
*/
readonly lakeFormationConfigurationRole?: IRole;
}
177 changes: 161 additions & 16 deletions framework/src/governance/lib/data-catalog-database.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@

import { Stack } from 'aws-cdk-lib';
import { CfnCrawler, CfnDatabase, CfnSecurityConfiguration } from 'aws-cdk-lib/aws-glue';
import { AddToPrincipalPolicyResult, Effect, IPrincipal, IRole, PolicyDocument, PolicyStatement, Role, ServicePrincipal } from 'aws-cdk-lib/aws-iam';
import { AddToPrincipalPolicyResult, Effect, IPrincipal, IRole, Policy, PolicyDocument, PolicyStatement, Role, ServicePrincipal } from 'aws-cdk-lib/aws-iam';
import { IKey, Key } from 'aws-cdk-lib/aws-kms';
import { CfnDataLakeSettings, CfnPrincipalPermissions, CfnResource } from 'aws-cdk-lib/aws-lakeformation';
import { AwsCustomResource } from 'aws-cdk-lib/custom-resources';
import { Construct } from 'constructs';
import { DataCatalogDatabaseProps } from './data-catalog-database-props';
import { Context, TrackedConstruct, TrackedConstructProps, Utils } from '../../utils';
import { /*grantDataLakeLocation,*/ grantCrawler, grantDataLakeLocation, putDataLakeSettings, registerS3Location, revokeIamAllowedPrincipal } from './lake-formation-helpers';
import { Context, PermissionModel, TrackedConstruct, TrackedConstructProps, Utils } from '../../utils';

/**
* An AWS Glue Data Catalog Database configured with the location and a crawler.
Expand Down Expand Up @@ -47,10 +50,55 @@ export class DataCatalogDatabase extends TrackedConstruct {
* KMS encryption Key used by the Crawler
*/
readonly crawlerLogEncryptionKey?: IKey;
/**
* The DataLakeSettings for Lake Formation
*/
readonly dataLakeSettings?: CfnDataLakeSettings;
/**
* The IAM Role used by Lake Formation to access data.
*/
readonly lfDataAccessRole?: IRole;
/**
* The Lake Formation data lake location
*/
readonly dataLakeLocation?: CfnResource;
/**
* The custom resource for revoking IAM permissions from the database
*/
readonly revokeIamAllowedPrincipal?: AwsCustomResource;
/**
* The Lake Formation grant on the database for the Crawler when Lake Formation or Hybrid is used
*/
readonly crawlerLfDbGrant?: CfnPrincipalPermissions;
/**
* The Lake Formation grant on the tables for the Crawler when Lake Formation or Hybrid is used
*/
readonly crawlerLfTablesGrant?: CfnPrincipalPermissions;
/**
* The Lake Formation grant on the data location for the Crawler when Lake Formation or Hybrid is used
*/
readonly crawlerLfLocationGrant?: CfnPrincipalPermissions;
/**
* The IAM Role used to revoke LakeFormation IAMAllowedPrincipals
*/
readonly lfRevokeRole?: IRole;
/**
* The Lake Formation grant on the data location for the CDK role
*/
readonly cdkLfLocationGrant?: CfnPrincipalPermissions;
/**
* Caching constructor properties for internal reuse by constructor methods
*/
private dataCatalogDatabaseProps: DataCatalogDatabaseProps;
/**
* The location prefix without trailing slash
*/
private cleanedLocationPrefix?: string;
/**
* The location S3 URI
*/
private s3LocationUri?: string;


constructor(scope: Construct, id: string, props: DataCatalogDatabaseProps) {
const trackedConstructProps: TrackedConstructProps = {
Expand All @@ -70,25 +118,71 @@ export class DataCatalogDatabase extends TrackedConstruct {
const hash = Utils.generateUniqueHash(this);
this.databaseName = props.name + '_' + hash.toLowerCase();

let s3LocationUri: string|undefined, locationPrefix: string|undefined;

if (catalogType === CatalogType.S3) {
locationPrefix = props.locationPrefix;

if (!locationPrefix!.endsWith('/')) {
locationPrefix += '/';
this.cleanedLocationPrefix = props.locationPrefix === undefined ? '' : props.locationPrefix.replace(/\/$/g, '');
this.s3LocationUri = props.locationBucket!.s3UrlForObject(this.cleanedLocationPrefix);

if (props.permissionModel === PermissionModel.LAKE_FORMATION || props.permissionModel === PermissionModel.HYBRID) {

const lfAdmins: IRole[]=[];
const cdkRole = Utils.getCdkDeploymentRole(this);
lfAdmins.push(cdkRole);

if (props.permissionModel === PermissionModel.LAKE_FORMATION) {
// Create a role for the AwsCustomResource to revoke IAMAllowedPrincipal
this.lfRevokeRole = props.lakeFormationConfigurationRole || new Role(this, 'LfRevokeRole', {
assumedBy: new ServicePrincipal('lambda.amazonaws.com'),
});
lfAdmins.push(this.lfRevokeRole);
}

this.dataLakeSettings = putDataLakeSettings(this, 'DataLakeSettings', lfAdmins);

// register location
if (props.locationBucket) {

[this.lfDataAccessRole, this.dataLakeLocation] = registerS3Location(
this, 'LakeFormationRegistration',
props.locationBucket,
this.cleanedLocationPrefix,
props.permissionModel,
props.lakeFormationDataAccessRole,
);
this.lfDataAccessRole.node.addDependency(this.dataLakeSettings!);

// this.cdkLfLocationGrant = grantDataLakeLocation(
// this, 'CdkLfLocationGrant',
// this.dataLakeLocation!.resourceArn,
// cdkRole,
// true
// );
// this.cdkLfLocationGrant.node.addDependency(this.dataLakeLocation!);
}
}

s3LocationUri = props.locationBucket!.s3UrlForObject(locationPrefix);
}

this.database = new CfnDatabase(this, 'GlueDatabase', {
catalogId: Stack.of(this).account,
databaseInput: {
name: this.databaseName,
locationUri: s3LocationUri,
locationUri: this.s3LocationUri,
},
});
this.database.applyRemovalPolicy(removalPolicy);

if (catalogType === CatalogType.S3
&& (props.permissionModel === PermissionModel.LAKE_FORMATION || props.permissionModel === PermissionModel.HYBRID)) {

// this.database.node.addDependency(this.dataLakeLocation!);
// this.database.node.addDependency(this.cdkLfLocationGrant!);

if (props.permissionModel === PermissionModel.LAKE_FORMATION) {

this.revokeIamAllowedPrincipal = revokeIamAllowedPrincipal(this, 'IamRevoke', this.databaseName, this.lfRevokeRole!, removalPolicy);
this.revokeIamAllowedPrincipal.node.addDependency(this.database);
}
}

let autoCrawl = props.autoCrawl;

Expand Down Expand Up @@ -211,12 +305,12 @@ export class DataCatalogDatabase extends TrackedConstruct {
const crawlerName = `${props.name}-${hash.toLowerCase()}-crawler`;

if (catalogType === CatalogType.S3) {
this.crawler = this.handleS3TypeCrawler(props, {
[this.crawler, this.crawlerLfDbGrant, this.crawlerLfTablesGrant, this.crawlerLfLocationGrant] = this.handleS3TypeCrawler(props, {
autoCrawlSchedule,
crawlerName,
crawlerSecurityConfigurationName: this.crawlerSecurityConfiguration.name,
locationPrefix: locationPrefix!,
s3LocationUri: s3LocationUri!,
locationPrefix: this.cleanedLocationPrefix!,
s3LocationUri: this.s3LocationUri!,
});
} else if (catalogType === CatalogType.JDBC) {
this.crawler = this.handleJDBCTypeCrawler(props, {
Expand Down Expand Up @@ -307,12 +401,51 @@ export class DataCatalogDatabase extends TrackedConstruct {
* @param s3Props `S3CrawlerProps`
* @returns `CfnCrawler`
*/
private handleS3TypeCrawler(props: DataCatalogDatabaseProps, s3Props: S3CrawlerProps): CfnCrawler {
private handleS3TypeCrawler(
props: DataCatalogDatabaseProps,
s3Props: S3CrawlerProps,
): [CfnCrawler, CfnPrincipalPermissions | undefined, CfnPrincipalPermissions | undefined, CfnPrincipalPermissions | undefined] {

const tableLevel = props.crawlerTableLevelDepth || this.calculateDefaultTableLevelDepth(s3Props.locationPrefix);
const grantPrefix = s3Props.locationPrefix == '/' ? '' : s3Props.locationPrefix;
props.locationBucket!.grantRead(this.crawlerRole!, grantPrefix+'*');

return new CfnCrawler(this, 'DatabaseAutoCrawler', {
let useLakeFormation = false;
let lfDbGrant: CfnPrincipalPermissions | undefined;
let lfTablesGrant: CfnPrincipalPermissions | undefined;
let lfLocationGrant: CfnPrincipalPermissions | undefined;

if (props.permissionModel === PermissionModel.HYBRID || props.permissionModel === PermissionModel.LAKE_FORMATION) {
useLakeFormation = true;

this.crawlerRole!.attachInlinePolicy(new Policy(this, 'CrawlerLfDataAccess', {
statements: [
new PolicyStatement({
effect: Effect.ALLOW,
actions: [
'lakeformation:GetDataAccess',
],
resources: ['*'],
}),
],
}));

lfLocationGrant = grantDataLakeLocation(
this, 'CrawlerLfLocationGrant',
props.locationBucket!.arnForObjects(this.cleanedLocationPrefix || ''),
this.crawlerRole!,
);

[lfDbGrant, lfTablesGrant] = grantCrawler(this, 'DbCrawler', this.databaseName, this.crawlerRole!);

lfLocationGrant.node.addDependency(this.dataLakeLocation!);
// lfLocationGrant.node.addDependency(this.cdkLfLocationGrant!);
lfDbGrant.node.addDependency(this.database);
lfTablesGrant.node.addDependency(this.database);
} else {
props.locationBucket!.grantRead(this.crawlerRole!, grantPrefix+'*');
}

const crawler = new CfnCrawler(this, 'DatabaseAutoCrawler', {
role: this.crawlerRole!.roleArn,
targets: {
s3Targets: [{
Expand All @@ -329,7 +462,19 @@ export class DataCatalogDatabase extends TrackedConstruct {
TableLevelConfiguration: tableLevel,
},
}),
lakeFormationConfiguration: {
useLakeFormationCredentials: useLakeFormation,
},
});
crawler.node.addDependency(this.database);

if (props.permissionModel === PermissionModel.HYBRID || props.permissionModel === PermissionModel.LAKE_FORMATION) {
crawler.node.addDependency(lfDbGrant!);
crawler.node.addDependency(lfTablesGrant!);
crawler.node.addDependency(lfLocationGrant!);
}

return [crawler, lfDbGrant, lfTablesGrant, lfLocationGrant];
}

/**
Expand Down
22 changes: 22 additions & 0 deletions framework/src/governance/lib/data-lake-catalog-props.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import { RemovalPolicy } from 'aws-cdk-lib';
import { CfnCrawler } from 'aws-cdk-lib/aws-glue';
import { IKey } from 'aws-cdk-lib/aws-kms';
import { DataLakeStorage } from '../../storage';
import { PermissionModel } from '../../utils';
import { IRole } from 'aws-cdk-lib/aws-iam';

/**
* Properties for the `DataLakeCatalog` Construct
Expand Down Expand Up @@ -54,4 +56,24 @@ export interface DataLakeCatalogProps {
* @default - The resources are not deleted (`RemovalPolicy.RETAIN`).
*/
readonly removalPolicy?: RemovalPolicy;

/**
* The permission model to apply to the Glue Database.
* @default - IAM permission model is used
*/
readonly permissionModel?: PermissionModel;

/**
* The IAM Role used by Lake Formation for data access.
* Only needed when `permissionModel` is set to `PermissionModel.LAKE_FORMATION` or `PermissionModel.HYBRID`.
* @default - A new role is created for the entire Data Lake
*/
readonly lakeFormationDataAccessRole?: IRole;

/**
* The IAM Role used to perform Lake Formation configuration.
* Only needed when `permissionModel` is set to `PermissionModel.LAKE_FORMATION` or `PermissionModel.HYBRID`.
* @default - A new role is created for the entire Data Lake
*/
readonly lakeFormationConfigurationRole?: IRole;
}
Loading
Loading