
Commit

Merge pull request openvinotoolkit#93 from nosovmik/rebase18
Rebase18
nosovmik authored Jun 22, 2021
2 parents cca3439 + 8bbfcab commit dc8f50b
Showing 130 changed files with 2,906 additions and 2,026 deletions.
6 changes: 4 additions & 2 deletions .ci/azure/linux_ngraph_onnx.yml
@@ -17,6 +17,8 @@ jobs:
WORK_DIR: $(Pipeline.Workspace)/_w
MODELS_DIR: /mount/cinfsshare/onnxtestdata
TMP_DIR: /mnt/tmp
ONNX_MODEL_ZOO_SHA: "d58213534f2a4d1c4b19ba62b3bb5f544353256e"


steps:
- script: |
@@ -55,7 +57,7 @@ jobs:
- script: docker build --tag=openvino-onnx-ci-image --file=.ci/openvino-onnx/Dockerfile .
displayName: 'Docker build'

- script: ngraph/python/tests/test_onnx/model_zoo_preprocess.sh -d $(TMP_DIR) -o
- script: ngraph/python/tests/test_onnx/model_zoo_preprocess.sh -d $(TMP_DIR) -o -s "$(ONNX_MODEL_ZOO_SHA)"
displayName: 'Get models'

- script: |
@@ -77,6 +79,6 @@ jobs:
displayName: 'Create swap'

- script: |
docker run --name openvino-onnx-ci-container --volume $(TMP_DIR)/model_zoo:/root/.onnx/model_zoo --volume $(MODELS_DIR)/msft:/root/.onnx/model_zoo/MSFT openvino-onnx-ci-image
docker run --name openvino-onnx-ci-container --volume $(TMP_DIR)/model_zoo/onnx_model_zoo_$(ONNX_MODEL_ZOO_SHA):/root/.onnx/model_zoo/onnx_model_zoo --volume $(MODELS_DIR)/msft:/root/.onnx/model_zoo/MSFT openvino-onnx-ci-image /bin/bash -c "tox && tox -e zoo_models"
displayName: 'Docker run'
70 changes: 0 additions & 70 deletions docs/IE_DG/Migration_CoreAPI.md

This file was deleted.

67 changes: 0 additions & 67 deletions docs/IE_DG/OnnxImporterTutorial.md

This file was deleted.

22 changes: 0 additions & 22 deletions docs/IE_DG/supported_plugins/FPGA.md

This file was deleted.

3 changes: 3 additions & 0 deletions docs/MO_DG/img/DeepSpeech-0.8.2.png
Binary file not shown.
3 changes: 0 additions & 3 deletions docs/MO_DG/img/DeepSpeech.png

This file was deleted.

docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md
@@ -161,7 +161,7 @@ Where `HEIGHT` and `WIDTH` are the input images height and width for which the m
* [GNMT](https://github.com/tensorflow/nmt) topology can be converted using [these instructions](tf_specific/Convert_GNMT_From_Tensorflow.md).
* [BERT](https://github.com/google-research/bert) topology can be converted using [these instructions](tf_specific/Convert_BERT_From_Tensorflow.md).
* [XLNet](https://github.com/zihangdai/xlnet) topology can be converted using [these instructions](tf_specific/Convert_XLNet_From_Tensorflow.md).

* [Attention OCR](https://github.com/emedvedev/attention-ocr) topology can be converted using [these instructions](tf_specific/Convert_AttentionOCR_From_Tensorflow.md).


## Loading Non-Frozen Models to the Model Optimizer <a name="loading-nonfrozen-models"></a>
docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_AttentionOCR_From_Tensorflow.md
@@ -0,0 +1,35 @@
# Convert TensorFlow* Attention OCR Model to Intermediate Representation {#openvino_docs_MO_DG_prepare_model_convert_model_tf_specific_Convert_AttentionOCR_From_Tensorflow}

This tutorial explains how to convert the Attention OCR (AOCR) model from the [TensorFlow* Attention OCR repository](https://github.com/emedvedev/attention-ocr) to the Intermediate Representation (IR).

## Extract Model from `aocr` Library

The easiest way to get an AOCR model is to install the `aocr` Python\* library:
```
pip install git+https://github.com/emedvedev/attention-ocr.git@master#egg=aocr
```
This library contains a pretrained model and allows you to train and run AOCR using the command line. After installing `aocr`, you can extract the model:
```
aocr export --format=frozengraph model/path/
```
After this step, you can find the frozen model in the `model/path/` folder.

## Convert the TensorFlow* AOCR Model to IR

The original AOCR model contains a data preprocessing part, which consists of the following steps:
* Decoding the input data to binary format, where the input data is an image represented as a string.
* Resizing the binary image to the working resolution.

After that, the resized image is sent to the convolutional neural network (CNN). The Model Optimizer does not support image decoding, so you should cut off the preprocessing part of the model using the `--input` command-line parameter.
```sh
python3 path/to/model_optimizer/mo_tf.py \
--input_model=model/path/frozen_graph.pb \
--input="map/TensorArrayStack/TensorArrayGatherV3:0[1 32 86 1]" \
--output "transpose_1,transpose_2" \
--output_dir path/to/ir/
```

Where:
* `map/TensorArrayStack/TensorArrayGatherV3:0[1 32 86 1]` is the name of the node producing the tensor after preprocessing, with its shape.
* `transpose_1` is the name of the node producing the tensor with predicted characters.
* `transpose_2` is the name of the node producing the tensor with the probabilities of the predicted characters.
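
After conversion, the resulting IR can be executed with the Inference Engine Python API. Below is a minimal sketch, not part of the original tutorial: the IR file names and the placeholder input are assumptions, and the output names should be verified against the generated IR. The input shape is queried from the network rather than hard-coded, because the Model Optimizer may convert the layout from NHWC to NCHW.
```python
import numpy as np
from openvino.inference_engine import IECore  # Inference Engine Python API (2021.x)

ie = IECore()
# Assumed IR file names produced by mo_tf.py from frozen_graph.pb.
net = ie.read_network(model="path/to/ir/frozen_graph.xml",
                      weights="path/to/ir/frozen_graph.bin")
exec_net = ie.load_network(network=net, device_name="CPU")

# The single input corresponds to the cut point after the preprocessing part.
input_name = next(iter(net.input_info))
input_shape = net.input_info[input_name].input_data.shape
image = np.zeros(input_shape, dtype=np.float32)  # placeholder for a real preprocessed image

results = exec_net.infer(inputs={input_name: image})

# transpose_1 holds the predicted characters, transpose_2 their probabilities;
# check net.outputs for the exact names in the generated IR.
predicted_chars = results["transpose_1"]
char_probabilities = results["transpose_2"]
```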
docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_DeepSpeech_From_Tensorflow.md
@@ -2,66 +2,81 @@

[DeepSpeech project](https://github.com/mozilla/DeepSpeech) provides an engine to train speech-to-text models.

## Download the Pre-Trained DeepSpeech Model
## Download the Pretrained DeepSpeech Model

[Pre-trained English speech-to-text model](https://github.com/mozilla/DeepSpeech#getting-the-pre-trained-model)
is publicly available. To download the model, please follow the instruction below:
Create a directory where the model and the metagraph with pretrained weights will be stored:
```
mkdir deepspeech
cd deepspeech
```
[Pretrained English speech-to-text model](https://github.com/mozilla/DeepSpeech/releases/tag/v0.8.2) is publicly available.
To download the model, follow the instructions below:

* For UNIX*-like systems, run the following command:
```
wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.3.0/deepspeech-0.3.0-models.tar.gz | tar xvfz -
wget -O - https://github.com/mozilla/DeepSpeech/archive/v0.8.2.tar.gz | tar xvfz -
wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.8.2/deepspeech-0.8.2-checkpoint.tar.gz | tar xvfz -
```
* For Windows* systems:
1. Download the archive from the DeepSpeech project repository: [https://github.com/mozilla/DeepSpeech/releases/download/v0.3.0/deepspeech-0.3.0-models.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.3.0/deepspeech-0.3.0-models.tar.gz).
2. Unpack it with a file archiver application.
1. Download the archive with the model: [https://github.com/mozilla/DeepSpeech/archive/v0.8.2.tar.gz](https://github.com/mozilla/DeepSpeech/archive/v0.8.2.tar.gz).
2. Download the TensorFlow\* MetaGraph with pretrained weights: [https://github.com/mozilla/DeepSpeech/releases/download/v0.8.2/deepspeech-0.8.2-checkpoint.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.8.2/deepspeech-0.8.2-checkpoint.tar.gz).
3. Unpack it with a file archiver application.

## Freeze the Model into a *.pb File

After you unpack the archive with the pre-trained model, you will have the new `models` directory with the
following files:
After unpacking the archives above, you have to freeze the model. Note that this requires
TensorFlow* version 1, which is not available for Python 3.8, so you need Python 3.7 or lower.
Before freezing, create a virtual environment and install the required packages:
```
alphabet.txt
lm.binary
output_graph.pb
output_graph.pbmm
output_graph.rounded.pb
output_graph.rounded.pbmm
trie
virtualenv --python=python3.7 venv-deep-speech
source venv-deep-speech/bin/activate
cd DeepSpeech-0.8.2
pip3 install -e .
```
Freeze the model with the following command:
```
python3 DeepSpeech.py --checkpoint_dir ../deepspeech-0.8.2-checkpoint --export_dir ../
```
After that, you will get the pretrained frozen model file `output_graph.pb` in the `deepspeech` directory created at
the beginning. The model consists of a preprocessing part and a main part. The preprocessing part converts the input
spectrogram into a form suitable for speech recognition (Mel-frequency cepstral coefficients). This part of the model
is not convertible into IR because it contains the unsupported operations `AudioSpectrogram` and `Mfcc`.

Pre-trained frozen model file is `output_graph.pb`.

![DeepSpeech model view](../../../img/DeepSpeech.png)
The main and most computationally expensive part of the model converts the preprocessed audio into text.
There are two peculiarities of the supported part of the model.

As you can see, the frozen model still has two variables: `previous_state_c` and
`previous_state_h`. It means that the model keeps training those variables at each inference.
The first is that the model contains an input with the sequence length, so the model can be converted only with a
fixed input shape and is therefore not reshapeable.
Refer to [Using Shape Inference](../../../../IE_DG/ShapeInference.md) for details.

At the first inference of this graph, the variables are initialized by zero tensors. After executing the `lstm_fused_cell` nodes, cell state and hidden state, which are the results of the `BlockLSTM` execution, are assigned to these two variables.
The second is that the frozen model still has two variables, `previous_state_c` and `previous_state_h`; the figure
of the frozen *.pb model is shown below. It means that the model keeps updating these variables at each inference.

With each inference of the DeepSpeech graph, initial cell state and hidden state data for `BlockLSTM` is taken from previous inference from variables. Outputs (cell state and hidden state) of `BlockLSTM` are reassigned to the same variables.
![DeepSpeech model view](../../../img/DeepSpeech-0.8.2.png)

It helps the model to remember the context of the words that it takes as input.
At the first inference, the variables are initialized with zero tensors. After execution, the cell state and hidden
state produced by the `BlockLSTM` are assigned back to these two variables.

## Convert the TensorFlow* DeepSpeech Model to IR
## Convert the Main Part of DeepSpeech Model into IR

The Model Optimizer assumes that the output model is for inference only. That is why you should cut those variables off and resolve keeping cell and hidden states on the application level.
Model Optimizer assumes that the output model is for inference only. That is why you should cut the `previous_state_c`
and `previous_state_h` variables off and keep the cell and hidden states on the application level, as shown in the
sketch after the parameter descriptions below.

There are certain limitations for the model conversion:
- Time length (`time_len`) and sequence length (`seq_len`) are equal.
- The original model cannot be reshaped, so you should keep the original shapes.

To generate the DeepSpeech Intermediate Representation (IR), provide the TensorFlow DeepSpeech model to the Model Optimizer with the following parameters:
To generate the IR, run the Model Optimizer with the following parameters:
```sh
python3 ./mo_tf.py \
--input_model path_to_model/output_graph.pb \
--freeze_placeholder_with_value input_lengths->[16] \
--input input_node,previous_state_h/read,previous_state_c/read \
--input_shape [1,16,19,26],[1,2048],[1,2048] \
--output raw_logits,lstm_fused_cell/GatherNd,lstm_fused_cell/GatherNd_1 \
python3 {path_to_mo}/mo_tf.py \
--input_model output_graph.pb \
--input "input_lengths->[16],input_node[1 16 19 26],previous_state_h[1 2048],previous_state_c[1 2048]" \
--output "cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd_1,cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd,logits" \
--disable_nhwc_to_nchw
```

Where:
* `--freeze_placeholder_with_value input_lengths->[16]` freezes sequence length
* `--input input_node,previous_state_h/read,previous_state_c/read` and
`--input_shape [1,16,19,26],[1,2048],[1,2048]` replace the variables with a placeholder
* `--output raw_logits,lstm_fused_cell/GatherNd,lstm_fused_cell/GatherNd_1` gets data for the next model
execution.
* `input_lengths->[16]` replaces the input node named "input_lengths" with a constant tensor of shape [1] containing a
single integer value, 16. This means that the model can now consume input sequences of length 16 only.
* `input_node[1 16 19 26],previous_state_h[1 2048],previous_state_c[1 2048]` replaces the variables with placeholders.
* `--output ".../GatherNd_1,.../GatherNd,logits"` specifies the output node names.
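
To illustrate keeping the cell and hidden states on the application level, here is a minimal sketch using the Inference Engine Python API. It is not part of the original guide: the IR file names and the placeholder feature chunks are assumptions, and the mapping of the two `GatherNd` outputs to the hidden and cell states should be verified against the actual IR.
```python
import numpy as np
from openvino.inference_engine import IECore  # Inference Engine Python API (2021.x)

ie = IECore()
net = ie.read_network(model="output_graph.xml", weights="output_graph.bin")  # assumed IR names
exec_net = ie.load_network(network=net, device_name="CPU")

# The cut-off variables become inputs that the application must maintain.
state_h = np.zeros((1, 2048), dtype=np.float32)  # previous_state_h, zero-initialized
state_c = np.zeros((1, 2048), dtype=np.float32)  # previous_state_c, zero-initialized

# Assumed output names; verify against net.outputs in the generated IR.
# Which GatherNd corresponds to the hidden vs. the cell state is an assumption here.
OUT_H = "cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd_1"
OUT_C = "cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd"

# Placeholder for real MFCC feature chunks of shape [1, 16, 19, 26] each.
feature_chunks = [np.zeros((1, 16, 19, 26), dtype=np.float32)]

for features in feature_chunks:
    results = exec_net.infer(inputs={
        "input_node": features,
        "previous_state_h": state_h,
        "previous_state_c": state_c,
    })
    logits = results["logits"]  # character predictions for this chunk
    # Feed the produced states back for the next inference, exactly as the
    # removed variables did inside the frozen graph.
    state_h = results[OUT_H]
    state_c = results[OUT_C]
```
This mirrors the behavior described above: zero tensors on the first inference, then the `BlockLSTM` results fed back on each subsequent one.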
2 changes: 2 additions & 0 deletions docs/doxygen/doxygen-ignore.txt
@@ -22,6 +22,8 @@ inference-engine/include/vpu/vpu_config.hpp
inference-engine/include/vpu/vpu_plugin_config.hpp
openvino/docs/benchmarks/performance_int8_vs_fp32.md
openvino/docs/get_started/get_started_macos.md
openvino/docs/optimization_guide/dldt_optimization_guide.md
openvino/docs/IE_DG/ShapeInference.md
inference-engine/include/details/ie_so_pointer.hpp
inference-engine/include/ie_compound_blob.h
inference-engine/include/ie_data.h
