Skip to content
This repository has been archived by the owner on Jan 3, 2023. It is now read-only.

Commit

Permalink
[SQL-DS-CACHE-36][POAE7-898]HCFS docs for OAP 1.1 (#37)
Browse files Browse the repository at this point in the history
* [SQL-DS-CACHE-36][POAE7-898]HCFS docs for OAP 1.1

* address comment

* address comment

* Replace white/black with allow/deny
  • Loading branch information
xieqi authored Mar 19, 2021
1 parent f0cd386 commit d98a0d7
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@ public class CachedInputStream extends FSInputStream {
private int cacheHitCount = 0;
private List<PMemBlock> cachedBlocks = new ArrayList<>();

// white list and black list regular expressions that decide whether to cache or not
private String cacheWhiteListRegexp;
private String cacheBlackListRegexp;
// allow list and deny list regular expressions that decide whether to cache or not
private String cacheAllowListRegexp;
private String cacheDenyListRegexp;
private boolean fileShouldBeCached;

public CachedInputStream(FSDataInputStream hdfsInputStream, Configuration conf,
Expand All @@ -93,22 +93,22 @@ public CachedInputStream(FSDataInputStream hdfsInputStream, Configuration conf,
this.statisticsStore = new RedisGlobalPMemCacheStatisticsStore(conf);
this.ids = new ObjectId[(int)((contentLength + pmemCachedBlockSize - 1) / pmemCachedBlockSize)];

cacheWhiteListRegexp = conf.get(Constants.CONF_KEY_CACHE_WHITE_LIST_REGEXP,
Constants.DEFAULT_CACHE_WHITE_LIST_REGEXP);
cacheAllowListRegexp = conf.get(Constants.CONF_KEY_CACHE_ALLOW_LIST_REGEXP,
Constants.DEFAULT_CACHE_ALLOW_LIST_REGEXP);

cacheBlackListRegexp = conf.get(Constants.CONF_KEY_CACHE_BLACK_LIST_REGEXP,
Constants.DEFAULT_CACHE_BLACK_LIST_REGEXP);
cacheDenyListRegexp = conf.get(Constants.CONF_KEY_CACHE_DENY_LIST_REGEXP,
Constants.DEFAULT_CACHE_DENY_LIST_REGEXP);

fileShouldBeCached = checkFileShouldBeCached();

LOG.info("Opening file: {} for reading. fileShouldBeCached: {}", path, fileShouldBeCached);
}

private boolean checkFileShouldBeCached() {
    // Decide whether this file's contents are eligible for PMem caching.
    // A file is cached when its path matches the allow-list regexp (an empty
    // allow-list matches everything) AND does not match the deny-list regexp
    // (an empty deny-list matches nothing). The regexps come from the
    // fs.cachedFs.allowlist.regexp / fs.cachedFs.denylist.regexp configuration.
    // NOTE: the pasted diff left both the old (white/black) and new (allow/deny)
    // bodies in place; only the allow/deny version is kept here.
    String pathString = path.toString();  // hoisted: matched against both lists
    return (cacheAllowListRegexp.isEmpty()
        || Pattern.compile(cacheAllowListRegexp).matcher(pathString).find())
        && (cacheDenyListRegexp.isEmpty()
        || !Pattern.compile(cacheDenyListRegexp).matcher(pathString).find());
}

private void advanceCachePosition(long pos) {
Expand Down Expand Up @@ -243,7 +243,7 @@ private boolean ensureDataInCache() throws IOException {
LOG.warn("exception, data not cached to pmem for block: {}", currentBlock);
}
} else {
LOG.debug("data will not be cached since it's in blacklist or it's already cached: {}",
LOG.debug("data will not be cached since it's in denylist or it's already cached: {}",
currentBlock);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,17 +71,17 @@ public class Constants {
public static final String CACHE_LOCATION_POLICY_HDFS_ONLY = "hdfs_only";

// regular expression that contains patterns of paths which will be cached.
// files will not be cached when their paths match black list regexp.
// files will not be cached when their paths match deny list regexp.
// an empty regexp results in matching everything.
// eg. cachedFs://localhost:9000/dir/
public static final String CONF_KEY_CACHE_WHITE_LIST_REGEXP = "fs.cachedFs.whiteList.regexp";
public static final String CONF_KEY_CACHE_ALLOW_LIST_REGEXP = "fs.cachedFs.allowlist.regexp";

public static final String DEFAULT_CACHE_WHITE_LIST_REGEXP = ".*";
public static final String DEFAULT_CACHE_ALLOW_LIST_REGEXP = ".*";

// regular expression that contains patterns of paths which will not be cached.
// an empty regexp results in no matching of black list.
// an empty regexp results in no matching of deny list.
// eg. io_data|io_control
public static final String CONF_KEY_CACHE_BLACK_LIST_REGEXP = "fs.cachedFs.blacklist.regexp";
public static final String CONF_KEY_CACHE_DENY_LIST_REGEXP = "fs.cachedFs.denylist.regexp";

public static final String DEFAULT_CACHE_BLACK_LIST_REGEXP = "";
public static final String DEFAULT_CACHE_DENY_LIST_REGEXP = "";
}
66 changes: 66 additions & 0 deletions docs/HCFS-User-Guide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# HCFS User Guide

* [Prerequisites](#prerequisites)
* [Configurations](#configuration)

## Prerequisites

HCFS based Data Source Cache on Spark 3.0.0 requires a working Hadoop cluster with YARN and Spark. Running Spark on YARN requires a binary distribution of Spark, which is built with YARN support. The HCFS based Data Source Cache also requires Plasma and Redis; please follow the [OAP-Installation-Guide](OAP-Installation-Guide.md) to install them.

## Configurations

### Spark Configurations

Before you run `$SPARK_HOME/bin/spark-shell `, you need to configure Spark for integration. You need to add or update the following configurations in the Spark configuration file `$SPARK_HOME/conf/spark-defaults.conf` on your working node.

```bash
spark.hadoop.fs.cachedFs.impl com.intel.oap.fs.hadoop.cachedfs.CachedFileSystem
# absolute path of the jar on your working node
spark.files /path/to/hcfs-sql-ds-cache-<version>.jar
# relative path to spark.files, just specify jar name in current dir
spark.executor.extraClassPath ./hcfs-sql-ds-cache-<version>.jar
# absolute path of the jar on your working node
spark.driver.extraClassPath /path/to/hcfs-sql-ds-cache-<version>.jar
```

### Redis Configuration

Add the following configuration to `$SPARK_HOME/conf/spark-defaults.conf`.

```
spark.hadoop.fs.cachedFs.redis.host $HOST
spark.hadoop.fs.cachedFs.redis.port $PORT
```

### Configuration for HCFS cache location policy

We provide three HCFS cache location policies; choose the one that best fits your workload:
* default
  With this policy, file block locations consist of cached blocks plus HDFS blocks (if the cached blocks are incomplete).
* cache_over_hdfs
  This policy uses cached block locations only if all requested content is cached; otherwise it uses HDFS block locations.
* hdfs_only
  This policy ignores cached blocks when finding file block locations.

Add the following configuration to `$SPARK_HOME/conf/spark-defaults.conf`.

```
spark.hadoop.fs.cachedFs.blockLocation.policy default or cache_over_hdfs or hdfs_only
```

## Configuration for HCFS cache path pattern

We provide HCFS cache path patterns to determine whether a path will be cached:
* allowlist
  Paths matching the pattern will be cached. An empty regexp matches everything.
  eg. cachedFs://localhost:9000/dir/
* denylist
  Paths matching the pattern will not be cached. An empty regexp matches nothing (no path is denied).
  eg. io_data|io_control

Add the following configuration to `$SPARK_HOME/conf/spark-defaults.conf`.

```
spark.hadoop.fs.cachedFs.allowlist.regexp $PATTERN
spark.hadoop.fs.cachedFs.denylist.regexp $PATTERN
```

0 comments on commit d98a0d7

Please sign in to comment.