fluid-cloudnative · cheyang · May 11, 2021 · Apr 16, 2021 · Apr 16, 2021 · Apr 16, 2021
diff --git a/api/v1alpha1/dataload_types.go b/api/v1alpha1/dataload_types.go
@@ -53,6 +53,12 @@ type DataLoadSpec struct {
 
 	// Target defines target paths that needs to be loaded
 	Target []TargetPath `json:"target,omitempty"`
+
+	// LoadMemoryData specifies if the dataload job should load memory or not
+	LoadMemoryData bool `json:"loadMemoryData,omitempty"`
+
+	// add HdfsConfig for JindoRuntime
+	HdfsConfig string `json:"hdfsConfig,omitempty"`
 }
 
 // DataLoadStatus defines the observed state of DataLoad

diff --git a/charts/fluid-dataloader/CHANGELOG.md → charts/fluid-dataloader/alluxio/CHANGELOG.md b/charts/fluid-dataloader/CHANGELOG.md → charts/fluid-dataloader/alluxio/CHANGELOG.md
diff --git a/charts/fluid-dataloader/Chart.yaml → charts/fluid-dataloader/alluxio/Chart.yaml b/charts/fluid-dataloader/Chart.yaml → charts/fluid-dataloader/alluxio/Chart.yaml
diff --git a/charts/fluid-dataloader/README.md → charts/fluid-dataloader/alluxio/README.md b/charts/fluid-dataloader/README.md → charts/fluid-dataloader/alluxio/README.md
diff --git a/...fluid-dataloader/templates/configmap.yaml → ...taloader/alluxio/templates/configmap.yaml b/...fluid-dataloader/templates/configmap.yaml → ...taloader/alluxio/templates/configmap.yaml
diff --git a/...luid-dataloader/templates/dataloader.yaml → ...aloader/alluxio/templates/dataloader.yaml b/...luid-dataloader/templates/dataloader.yaml → ...aloader/alluxio/templates/dataloader.yaml
diff --git a/charts/fluid-dataloader/values.yaml → charts/fluid-dataloader/alluxio/values.yaml b/charts/fluid-dataloader/values.yaml → charts/fluid-dataloader/alluxio/values.yaml
diff --git a/charts/fluid-dataloader/jindo/CHANGELOG.md b/charts/fluid-dataloader/jindo/CHANGELOG.md
@@ -0,0 +1,4 @@
+### 0.1.0
+
+- Support parallel prefetch job
+- Support configurations by setting values
diff --git a/charts/fluid-dataloader/jindo/Chart.yaml b/charts/fluid-dataloader/jindo/Chart.yaml
@@ -0,0 +1,23 @@
+apiVersion: v2
+name: fluid-dataloader
+description: A Helm chart for Fluid to prefetch data
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+appVersion: 0.1.0
diff --git a/charts/fluid-dataloader/jindo/README.md b/charts/fluid-dataloader/jindo/README.md
@@ -0,0 +1,86 @@
+# fluid-dataloader
+
+## Prerequisite
+- Dataset deployed
+- Jindo Runtime deployed
+- Dataset mountPoint mounted
+- Dataset-related PV, PVC created
+
+## Install
+1. Get the name of the dataset-related PVC
+```shell script
+$ kubectl get pvc
+NAME         STATUS   VOLUME       CAPACITY   ACCESS MODES   STORAGECLASS   AGE
+<pvc-name>   Bound    <pv-name>    100Gi      RWX                           4h5m
+```
+Let `<pvc-name>` be the name of your dataset-related PVC; it is usually the same as the name of your dataset.
+
+2. Get the number of Jindo workers
+```shell script
+kubectl get pod -l release=<dataset-name> | grep -c "worker"
+```
+
+3. Install fluid-dataloader
+
+```shell script
+helm install \
+  --set dataloader.numWorker=<num-of-workers> \
+  --set dataloader.threads=2 \
+  <pvc-name>-load charts/fluid-dataloader
+```
+
+You will see something like this:
+```
+helm install hbase-load charts/fluid-dataloader/
+NAME: <pvc-name>-load
+LAST DEPLOYED: Fri Jul 31 19:52:11 2020
+NAMESPACE: default
+STATUS: deployed
+REVISION: 1
+TEST SUITE: None
+```
+
+Some dataloader jobs will be launched. You will see multiple jobs running on different nodes:
+```shell script
+kubectl get pod -o wide -l role=dataload-pod
+```
+
+Once a job completes, you can check the time consumed by the data prefetch:
+```shell script
+kubectl logs <pvc-name>-loader-xxxxx
+```
+and see something like this:
+```
+THREADS=2
+DATAPATH=/data/*
+python multithread_read_benchmark.py --threads=2 --path=/data/*
+/data/* contains 15 items
+/data/* processing 15 items with 2 threads uses 32.6712441444s, avg 0.459119338513/s, avg 8743748.5924B/s, avg 8.33868846169MiB/s
+```
+
+Now that all data should be cached, delete the release and install it again:
+```shell script
+helm del <pvc-name>-load
+
+helm install \
+  --set dataloader.numWorker=<num-of-workers> \
+  --set dataloader.threads=2 \
+  <pvc-name>-load charts/fluid-dataloader
+```
+
+Check again; this time it should be much faster:
+```shell script
+kubectl logs <pvc-name>-loader-yyyyy
+```
+```
+THREADS=2
+DATAPATH=/data/*
+python multithread_read_benchmark.py --threads=2 --path=/data/*
+/data/* contains 15 items
+/data/* processing 15 items with 2 threads uses 0.308158159256s, avg 48.6763032211/s, avg 927021194.862B/s, avg 884.076304304MiB/s
+```
+
+## Uninstall
+```
+helm del <pvc-name>-load
+```
diff --git a/charts/fluid-dataloader/jindo/templates/configmap.yaml b/charts/fluid-dataloader/jindo/templates/configmap.yaml
@@ -0,0 +1,79 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ printf "%s-data-load-script" .Release.Name }}
+  labels:
+    release: {{ .Release.Name }}
+    role: dataload-job
+data:
+  dataloader.jindo.init: |
+    #!/usr/bin/env bash
+    set -xe
+    jindo_env_vars=(
+      STORAGE_ADDRESS
+    )
+    # Re-exports every variable named in jindo_env_vars that is currently set,
+    # wrapping its value in literal double quotes.
+    #
+    # The allow-list is walked directly so that only exact variable names are
+    # touched; a substring/regex test against the joined list would also match
+    # unrelated variables whose names are fragments of an allowed name.
+    function public::jindo::init_conf() {
+      local key
+      for key in "${jindo_env_vars[@]}"; do
+        # ${!key+x} expands to "x" only when the variable named by $key is set
+        # (an empty value still counts as set).
+        if [[ -n "${!key+x}" ]]; then
+          # NOTE(review): the escaped quotes become part of the exported value,
+          # and the export is only visible to this process and its children -
+          # confirm that both are intended.
+          export "${key}=\"${!key}\""
+        fi
+      done
+    }
+    main() {
+      public::jindo::init_conf
+    }
+    main
+  dataloader.distributedLoad: |
+    #!/usr/bin/env bash
+    set -xe
+
+    # Runs the Jindo cache command for a single path.
+    #   $1 - path to load, appended to the filesystem prefix
+    #   $2 - replica count passed to `-r` (falls back to 1 when empty)
+    #   $3 - filesystem URI prefix placed in front of the path
+    # Reads the globals needLoadMetadata and loadMemoryData, which main sets
+    # before calling this function.
+    function distributedLoad() {
+        local path=$1
+        # An empty replica would make `-r` consume the target path as its value.
+        local replica=${2:-1}
+        local default=$3
+        local target="${default}${path}"
+
+        if [[ "$needLoadMetadata" == 'true' ]]; then
+            time jindo jfs -metaSync -R "$target"
+        else
+            echo -e "$target no need to cache metadata"
+        fi
+
+        # `-m` is appended only when loadMemoryData is 'true'
+        # (presumably it requests in-memory caching - confirm against the Jindo CLI docs).
+        local cache_opts=(-cache -s)
+        if [[ "$loadMemoryData" == 'true' ]]; then
+            cache_opts+=(-m)
+        fi
+        time jindo jfs "${cache_opts[@]}" -r "$replica" "$target"
+    }
+
+    # Entry point: loads every path listed in DATA_PATH.
+    #
+    # Environment:
+    #   NEED_LOAD_METADATA - 'true' to run the metadata sync before caching
+    #   LOAD_MEMORY_DATA   - 'true' to add `-m` to the cache command
+    #   DATA_PATH          - ":"-separated list of paths
+    #   PATH_REPLICAS      - ":"-separated replica counts, positionally matching DATA_PATH
+    function main() {
+        # Intentionally global: distributedLoad reads both flags.
+        needLoadMetadata="$NEED_LOAD_METADATA"
+        loadMemoryData="$LOAD_MEMORY_DATA"
+
+        local default="jfs://jindo"
+        local paths replicas
+        # Split on ":" only; unlike an unquoted expansion this performs no globbing.
+        IFS=':' read -r -a paths <<< "$DATA_PATH"
+        IFS=':' read -r -a replicas <<< "$PATH_REPLICAS"
+
+        local i
+        for ((i = 0; i < ${#paths[@]}; i++)); do
+            local path="${paths[i]}"
+            # Fall back to one replica when PATH_REPLICAS has fewer entries than DATA_PATH.
+            local replica="${replicas[i]:-1}"
+            echo -e "distributedLoad on $path starts"
+            # Quoted so an empty value can never shift the positional arguments.
+            distributedLoad "$path" "$replica" "$default"
+        done
+    }
+
+    main "$@"
+
+
+
+
+
diff --git a/charts/fluid-dataloader/jindo/templates/dataloader.yaml b/charts/fluid-dataloader/jindo/templates/dataloader.yaml
@@ -0,0 +1,113 @@
+# .Release.Name will be used to decide which dataset will be preload
+# .Release.Name should be like `<pvc-name>-load`(e.g. hbase-load for a PersistentVolumeClaim named `hbase`)
+# TODO: the length of .Release.Name won't exceed 53(limited by Helm), which means length of `<pvc-name>` can't exceed 48. This might be a problem.
+  {{/*  {{  $datasetName := "" -}}*/}}
+  {{/*  {{- $randomSuffix := "" -}}*/}}
+  {{/*  {{- if regexMatch "^[A-Za-z0-9._-]+-load-[A-Za-z0-9]{5}$" .Release.Name -}}*/}}
+  {{/*    {{- $arr := regexSplit "-load-" .Release.Name -1 -}}*/}}
+  {{/*    {{- $datasetName = first $arr -}}*/}}
+  {{/*    {{- $randomSuffix = last $arr -}}*/}}
+  {{/*  {{- else -}}*/}}
+  {{/*    {{- printf "Illegal release name. Should be like <dataset-name>-load-<suffix-length-5>. Current name: %s" .Release.Name | fail -}}*/}}
+  {{/*  {{- end }}*/}}
+# Job that runs the Jindo init script and then the data-load script in one dataloader container.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ printf "%s-job" .Release.Name }}
+  labels:
+    release: {{ .Release.Name }}
+    role: dataload-job
+    targetDataset: {{ required "targetDataset should be set" .Values.dataloader.targetDataset }}
+spec:
+  backoffLimit: {{ .Values.dataloader.backoffLimit | default "3" }}
+  completions: 1
+  parallelism: 1
+  template:
+    metadata:
+      name: {{ printf "%s-loader" .Release.Name }}
+      labels:
+        release: {{ .Release.Name }}
+        role: dataload-pod
+        targetDataset: {{ required "targetDataset should be set" .Values.dataloader.targetDataset }}
+    spec:
+      restartPolicy: OnFailure
+      containers:
+        - name: dataloader
+          image: {{ required "Dataloader image should be set" .Values.dataloader.image }}
+          imagePullPolicy: IfNotPresent
+          command: ["/bin/sh", "-c"]
+          args: ["/scripts/jindo_env_init.sh && /scripts/jindo_dataload.sh"]
+          # DATA_PATH and PATH_REPLICAS are built by joining every dataloader.targetPaths entry with ":".
+          # Each path is required; a missing replicas value falls back to 1.
+          {{- $targetPaths := "" }}
+          {{- range .Values.dataloader.targetPaths }}
+          {{- $targetPaths = cat $targetPaths (required "Path must be set" .path) ":" }}
+          {{- end }}
+          {{- $targetPaths = $targetPaths | nospace | trimSuffix ":" }}
+
+          {{- $pathReplicas := ""}}
+          {{- range .Values.dataloader.targetPaths }}
+          {{- $pathReplicas = cat $pathReplicas ( default 1 .replicas ) ":"}}
+          {{- end }}
+          {{- $pathReplicas = $pathReplicas | nospace | trimSuffix ":"}}
+          env:
+            # Set to the pod's own IP through a fieldRef.
+            - name: STORAGE_ADDRESS
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.podIP
+            - name: NEED_LOAD_METADATA
+              value: {{ default false .Values.dataloader.loadMetadata | quote }}
+            - name: LOAD_MEMORY_DATA
+              value: {{ default false .Values.dataloader.loadMemoryData | quote }}
+            - name: DATA_PATH
+              value: {{ $targetPaths | quote }}
+            - name: PATH_REPLICAS
+              value: {{ $pathReplicas | quote }}
+          # Keys of the <targetDataset>-jindofs-client-config ConfigMap are also injected as environment variables.
+          envFrom:
+            - configMapRef:
+                name: {{ required "targetDataset should be set" .Values.dataloader.targetDataset }}-jindofs-client-config
+          volumeMounts:
+            - name: bigboot-config
+              mountPath: /bigboot.cfg
+              subPath: bigboot.cfg
+            - name: bigboot-config
+              mountPath: /hdfs-3.2.1/etc/hadoop/core-site.xml
+              subPath: core-site.xml
+            # The hdfs-site.xml mount is rendered only when dataloader.hdfsConfig is set.
+            {{- if .Values.dataloader.hdfsConfig }}
+            - name: hdfs-confs
+              mountPath: /hdfs-site.xml
+              subPath: hdfs-site.xml
+            {{- end }}
+            - mountPath: /scripts
+              name: data-load-script
+            # Paths flagged fluidNative are mounted under /data/, with inner "/" replaced by "-".
+            {{- range .Values.dataloader.targetPaths }}
+            {{- if .fluidNative }}
+            - mountPath: {{ .path | trimAll "/" | replace "/" "-" | printf "/data/%s"}}
+              name: {{ .path | trimAll "/" | replace "/" "-" | printf "native-%s"}}
+            {{- end }}
+            {{- end }}
+      volumes:
+        - name: bigboot-config
+          configMap:
+            name: {{ required "targetDataset should be set" .Values.dataloader.targetDataset }}-jindofs-config
+        # dataloader.hdfsConfig is used as the name of the ConfigMap backing this volume.
+        {{- if .Values.dataloader.hdfsConfig }}
+        - name: hdfs-confs
+          configMap:
+            name: {{ .Values.dataloader.hdfsConfig }}
+        {{- end }}
+        - name: data-load-script
+          configMap:
+            name: {{ printf "%s-data-load-script" .Release.Name }}
+            items:
+              # mode 365 is the decimal form of octal 0555 (read and execute for everyone).
+              - key: dataloader.jindo.init
+                path: jindo_env_init.sh
+                mode: 365
+              - key: dataloader.distributedLoad
+                path: jindo_dataload.sh
+                mode: 365
+        # One hostPath volume is declared for each target path flagged fluidNative.
+        {{- range .Values.dataloader.targetPaths }}
+        {{- if .fluidNative }}
+        - name: {{ .path | trimAll "/" | replace "/" "-" | printf "native-%s"}}
+          hostPath:
+            path: {{ .path }}
+        {{- end }}
+        {{- end }}
+
diff --git a/charts/fluid-dataloader/jindo/values.yaml b/charts/fluid-dataloader/jindo/values.yaml
@@ -0,0 +1,30 @@
+# Default values for fluid-dataloader.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+dataloader:
+  # Optional
+  # Default: 3
+  # Description: how many times the prefetch job can fail, i.e. `Job.spec.backoffLimit`
+  backoffLimit: 3
+
+  # Required
+  # Description: the dataset that this DataLoad targets
+  targetDataset: #imagenet
+
+  # Optional
+  # Default: false
+  # Description: should load metadata from UFS when doing data load
+  loadMetadata: false
+
+  # Optional
+  # Default: false
+  # Description: passed to the job as LOAD_MEMORY_DATA; when true the load script adds `-m` to `jindo jfs -cache`
+  loadMemoryData: false
+
+  # Optional
+  # Default: "" (nothing is mounted)
+  # Description: name of a ConfigMap holding `hdfs-site.xml`; when set, it is mounted at /hdfs-site.xml in the dataloader container
+  hdfsConfig: ""
+
+  # Optional
+  # Default: (path: "/", replicas: 1, fluidNative: false)
+  # Description: which paths should the DataLoad load
+  targetPaths:
+    - path: "/"
+      replicas: 1
+      fluidNative: false
+
+  # Required
+  # Description: the image that the DataLoad job uses
+  image: #<jindo-image>
diff --git a/charts/fluid/fluid/crds/data.fluid.io_dataloads.yaml b/charts/fluid/fluid/crds/data.fluid.io_dataloads.yaml
@@ -61,6 +61,13 @@ spec:
               required:
               - name
               type: object
+            hdfsConfig:
+              description: add HdfsConfig for JindoRuntime
+              type: string
+            loadMemoryData:
+              description: LoadMemoryData specifies if the dataload job should load
+                memory or not
+              type: boolean
             loadMetadata:
               description: LoadMetadata specifies if the dataload job should load
                 metadata

diff --git a/charts/fluid/fluid/values.yaml b/charts/fluid/fluid/values.yaml
@@ -33,8 +33,8 @@ runtime:
     portRange: 18000-19999
     enabled: false
     smartdata:
-      image: registry.cn-shanghai.aliyuncs.com/jindofs/smartdata:3.5.2
+      image: registry.cn-shanghai.aliyuncs.com/jindofs/smartdata:3.5.0
     fuse:
-      image: registry.cn-shanghai.aliyuncs.com/jindofs/jindo-fuse:3.5.2
+      image: registry.cn-shanghai.aliyuncs.com/jindofs/jindo-fuse:3.5.0
     controller:
       image: registry.aliyuncs.com/fluid/jindoruntime-controller:v0.6.0-ed9b1be
diff --git a/charts/jindofs/Chart.yaml b/charts/jindofs/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v1
-appVersion: 3.5.2
+appVersion: 3.5.0
 description: FileSystem on the cloud based on Aliyun Object Storage aimed for data
   acceleration.
 home: https://help.aliyun.com/document_detail/164207.html
@@ -14,4 +14,4 @@ maintainers:
 - email: [email protected]
   name: Yang Che
 name: jindofs
-version: 3.5.2
+version: 3.5.0