Merge branch 'master' into github_actions/local_cache_impl
mryzhov authored Mar 21, 2024
2 parents 6d0d8ee + a8c224f commit 6303e65
Showing 123 changed files with 2,913 additions and 697 deletions.
90 changes: 59 additions & 31 deletions .github/scripts/collect_github_metrics.py
@@ -6,37 +6,49 @@
import logging
import psycopg2
import dateutil
import argparse

def init_logger():
LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
logging.basicConfig(level=LOGLEVEL,
format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%m-%d-%Y %H:%M:%S')

def make_parser():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--repository-name', type=str, required=True,
help='Repository name in OWNER/REPOSITORY format')
parser.add_argument('--run-id', type=str, required=True,
help='Workflow Run ID')

return parser

def create_db_tables(conn, cur):
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_runs_test(
id SERIAL,
run_id BIGINT PRIMARY KEY,
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_runs(
id SERIAL PRIMARY KEY,
run_id BIGINT,
html_url TEXT,
name VARCHAR(255),
run_started_at TIMESTAMP,
created_at TIMESTAMP,
updated_at TIMESTAMP,
triggering_actor_login VARCHAR(255),
conclusion VARCHAR(25),
run_number INT,
event VARCHAR(50),
run_attempt INT,
repository_full_name VARCHAR(255),
head_repository_full_name VARCHAR(255),
head_branch VARCHAR(255),
status VARCHAR(25),
display_title TEXT,
path TEXT
path TEXT,
total_duration_seconds INT
);
''')
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_jobs_test(
id SERIAL,
job_id BIGINT PRIMARY KEY,
parent_run_id BIGINT REFERENCES github_workflow_runs_test(run_id),
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_jobs(
id SERIAL PRIMARY KEY,
job_id BIGINT,
parent_run_id BIGINT,
html_url TEXT,
name VARCHAR(255),
created_at TIMESTAMP,
@@ -47,12 +59,14 @@ def create_db_tables(conn, cur):
runner_name VARCHAR(255),
status VARCHAR(25),
conclusion VARCHAR(25),
head_branch VARCHAR(255)
head_branch VARCHAR(255),
run_attempt INT,
workflow_name TEXT
);
''')
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_steps_test(
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_steps(
id SERIAL PRIMARY KEY,
parent_job_id BIGINT REFERENCES github_workflow_jobs_test(job_id),
parent_job_id BIGINT,
name VARCHAR(255),
conclusion VARCHAR(25),
number INT,
@@ -65,20 +79,16 @@

def main():
init_logger()

parser = make_parser()
args = parser.parse_args()
logger = logging.getLogger(__name__)

github_token = os.environ.get('GITHUB_TOKEN')
if not github_token:
raise ValueError('GITHUB_TOKEN environment variable is not set!')

run_id = os.environ.get('RUN_ID')
if not run_id:
raise ValueError('RUN_ID environment variable is not set!')

repo_name = os.environ.get('GITHUB_REPOSITORY')
if not repo_name:
raise ValueError('GITHUB_REPOSITORY environment variable is not set!')
run_id = args.run_id
repo_name = args.repository_name


# this should be specified in runner's env
@@ -102,18 +112,30 @@ def main():
repo = g.get_repo(repo_name)

run = repo.get_workflow_run(int(run_id))

workflow_data_query = f'''INSERT INTO github_workflow_runs_test(
if run.status != 'completed':
logger.error('Run %s is not completed! Only completed runs should be in the database', run_id)
raise SystemExit(1)

# We rely on the following assumptions:
# - The workflow run is completed. When run.status != 'completed' we should not add it to the database:
#   theoretically, the second attempt can be triggered right after the completion of the first one,
#   or while the runner which executes this script is deploying.
#
# - A job's queued duration equals "job.started_at - job.created_at" if started_at > created_at.
#   Otherwise, the job should not be added to the database.
total_duration_seconds = round(run.timing().run_duration_ms / 1000)
workflow_data_query = f'''INSERT INTO workflow_runs(
run_id, html_url, name,
run_started_at, triggering_actor_login, conclusion,
run_number, event, run_attempt, repository_full_name,
head_branch, display_title, path)
run_started_at, created_at, updated_at, triggering_actor_login, conclusion,
event, run_attempt, repository_full_name,
head_branch, display_title, path, total_duration_seconds)
VALUES(
'{run_id}', '{run.html_url}', '{run.name}', '{run.run_started_at}',
'{run.created_at}', '{run.updated_at}',
'{run.raw_data['triggering_actor']['login']}',
'{run.conclusion}', '{run.run_number}', '{run.event}',
'{run.conclusion}', '{run.event}',
'{run.run_attempt}', '{run.raw_data['repository']['full_name']}',
'{run.head_branch}', '{run.display_title}', '{run.path}'
'{run.head_branch}', '{run.display_title}', '{run.path}', '{total_duration_seconds}'
);
'''

@@ -126,6 +148,10 @@ def main():
duration_seconds = 0

job_created_at_date = dateutil.parser.parse(job.raw_data['created_at'])
if job_created_at_date > job.started_at:
logger.warning('Skipping job %s of run %s - most likely a stub \
job created after workflow restart', job.name, run_id)
continue

queued_duration_timedelta = job.started_at - job_created_at_date
queued_duration_seconds = round(queued_duration_timedelta.total_seconds())
@@ -134,17 +160,19 @@ def main():
duration_seconds = round(duration_timedelta.total_seconds())

job_data_query = f'''
INSERT INTO github_workflow_jobs_test(
INSERT INTO workflow_jobs(
job_id, parent_run_id, html_url, name,
created_at, started_at, completed_at,
queued_duration_seconds, duration_seconds,
runner_name, status, conclusion, head_branch)
runner_name, status, conclusion, head_branch,
run_attempt, workflow_name
)
VALUES(
'{job_id}', '{run_id}', '{job.html_url}', '{job.name}',
'{job.raw_data['created_at']}', '{job.started_at}', '{job.completed_at}',
'{queued_duration_seconds}', '{duration_seconds}',
'{job.raw_data['runner_name']}', '{job.status}', '{job.conclusion}',
'{job.raw_data['head_branch']}'
'{job.raw_data['head_branch']}', '{job.raw_data['run_attempt']}', '{job.raw_data['workflow_name']}'
);
'''
logger.debug('Job query: %s', job_data_query)
@@ -154,7 +182,7 @@ def main():
duration_seconds = round(duration_seconds_timedelta.total_seconds())

step_data_query = f'''
INSERT INTO github_workflow_steps_test(
INSERT INTO workflow_steps(
parent_job_id, name, conclusion,
number, started_at, completed_at,
duration_seconds)
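
As a side illustration (not part of the diff above), the same job insert can be expressed as a parameterized query, letting psycopg2 quote the values instead of building the SQL with an f-string. This sketch assumes it runs inside the same loop, with cur, run_id, job, job_id and the duration variables already defined as in the script:

job_data_query = '''
    INSERT INTO workflow_jobs(
        job_id, parent_run_id, html_url, name,
        created_at, started_at, completed_at,
        queued_duration_seconds, duration_seconds,
        runner_name, status, conclusion, head_branch,
        run_attempt, workflow_name)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
'''
# psycopg2 substitutes and escapes each %s placeholder from the tuple below.
cur.execute(job_data_query, (
    job_id, run_id, job.html_url, job.name,
    job.raw_data['created_at'], job.started_at, job.completed_at,
    queued_duration_seconds, duration_seconds,
    job.raw_data['runner_name'], job.status, job.conclusion,
    job.raw_data['head_branch'], job.raw_data['run_attempt'],
    job.raw_data['workflow_name'],
))
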
6 changes: 1 addition & 5 deletions .github/workflows/code_snippets.yml
@@ -39,9 +39,5 @@ jobs:
- name: CMake configure
run: cmake -DCMAKE_BUILD_TYPE=Release -DTHREADING=SEQ -B build

- name: Get number of CPU cores
uses: SimenB/github-actions-cpu-cores@v2
id: cpu-cores

- name: Build snippets
run: cmake --build build --target openvino_docs_snippets --parallel ${{ steps.cpu-cores.outputs.count }}
run: cmake --build build --target openvino_docs_snippets --parallel
6 changes: 0 additions & 6 deletions .github/workflows/coverage.yml
@@ -54,10 +54,6 @@ jobs:
python3 -m pip install -r ${{ github.workspace }}/tools/mo/requirements_tf2.txt
python3 -m pip install -r ${{ github.workspace }}/tools/mo/requirements_dev.txt
- name: Get number of CPU cores
uses: SimenB/github-actions-cpu-cores@v2
id: cpu-cores

- name: Build OpenVINO with CMake
uses: ashutoshvarma/action-cmake-build@master
with:
@@ -81,7 +77,6 @@ jobs:
-DCMAKE_CXX_LINKER_LAUNCHER=ccache
-DENABLE_SYSTEM_SNAPPY=ON
build-type: Release
parallel: ${{ steps.cpu-cores.outputs.count }}

- name: Install wheel packages
run: cmake -DCOMPONENT=python_wheels -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install_pkg -P '${{ github.workspace }}/build/cmake_install.cmake'
@@ -129,7 +124,6 @@ jobs:
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-DCMAKE_C_LINKER_LAUNCHER=ccache
-DCMAKE_CXX_LINKER_LAUNCHER=ccache
parallel: ${{ steps.cpu-cores.outputs.count }}
- name: Print info
7 changes: 6 additions & 1 deletion .github/workflows/send_workflows_to_opentelemetry.yml
@@ -49,6 +49,8 @@ jobs:
- name: Install deps
run: |
pip3 install -r .github/scripts/requirements.txt
# dependency review action has these as an exception
# yet it still complains, so install them here
pip3 install PyGithub==2.2.0 psycopg2-binary==2.9.9
- name: Send metrics to SQL database
@@ -58,6 +60,9 @@
PGHOST: ${{ secrets.METRICS_DATABASE_HOST }}
PGUSER: ${{ secrets.METRICS_DATABASE_USERNAME }}
PGPASSWORD: ${{ secrets.METRICS_DATABASE_PASSWORD }}
PGDATABASE: ${{ secrets.METRICS_DATABASE_NAME }}
PGPORT: 5432
run: |
python3 .github/scripts/collect_github_metrics.py
python3 .github/scripts/collect_github_metrics.py \
--run-id ${{ github.event.workflow_run.id }} \
--repository-name ${GITHUB_REPOSITORY}
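
The PGHOST/PGPORT/PGUSER/PGPASSWORD/PGDATABASE variables exported above are the standard libpq connection variables, so the script does not have to receive credentials on its command line. Assuming the script keeps opening its connection through psycopg2, the relevant part reduces to roughly this sketch:

import psycopg2

# With the PG* variables set by the workflow step, libpq supplies the host, port,
# user, password and database; an empty DSN means "use environment defaults".
conn = psycopg2.connect("")
cur = conn.cursor()
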
@@ -23,44 +23,43 @@ Optimize Inference
optimizations that can be done independently. Inference
speed depends on latency and throughput.


Runtime optimization, or deployment optimization, focuses on tuning inference parameters and execution means (e.g., the optimum number of requests executed simultaneously). Unlike model-level optimizations, they are highly specific to the hardware and case they are used for, and often come at a cost.
``ov::hint::inference_precision`` is a "typical runtime configuration" which trades accuracy for performance, allowing ``fp16/bf16`` execution for the layers that remain in ``fp32`` after quantization of the original ``fp32`` model.

Therefore, optimization should start with defining the use case. For example, if it is about processing millions of samples by overnight jobs in data centers, throughput could be prioritized over latency. On the other hand, real-time usages would likely trade off throughput to deliver the results at minimal latency. A combined scenario is also possible, targeting the highest possible throughput, while maintaining a specific latency threshold.

It is also important to understand how the full-stack application would use the inference component "end-to-end." For example, to know what stages need to be orchestrated to save workload devoted to fetching and preparing input data.

For more information on this topic, see the following articles:

* :doc:`Supported Devices <../../about-openvino/compatibility-and-support/supported-devices>`
* :doc:`Inference Devices and Modes <inference-devices-and-modes>`
* :ref:`Inputs Pre-processing with the OpenVINO <inputs_pre_processing>`
* :ref:`Async API <async_api>`
* :ref:`The 'get_tensor' Idiom <tensor_idiom>`
* For variably-sized inputs, consider :doc:`dynamic shapes <dynamic-shapes>`


See the :doc:`latency <optimize-inference/optimizing-latency>` and :doc:`throughput <optimize-inference/optimizing-throughput>` optimization guides for **use-case-specific optimizations**.

Writing Performance-Portable Inference Applications
###################################################

Although inference performed in OpenVINO Runtime can be configured with a multitude of low-level performance settings, it is not recommended in most cases. Firstly, achieving the best performance with such adjustments requires deep understanding of device architecture and the inference engine.


Secondly, such optimization may not translate well to other device-model combinations. In other words, one set of execution parameters is likely to result in different performance when used under different conditions. For example:

* both the CPU and GPU support the notion of :doc:`streams <./optimize-inference/optimizing-throughput/advanced_throughput_options>`, yet they deduce their optimal number very differently.
* Even among devices of the same type, different execution configurations can be considered optimal, as in the case of instruction sets or the number of cores for the CPU and the batch size for the GPU.
* Different models have different optimal parameter configurations, considering factors such as compute vs memory-bandwidth, inference precision, and possible model quantization.
* Execution "scheduling" impacts performance strongly and is highly device-specific, for example, GPU-oriented optimizations like batching, combining multiple inputs to achieve the optimal throughput, :doc:`do not always map well to the CPU <optimize-inference/optimizing-low-level-implementation>`.


To make the configuration process much easier and its performance optimization more portable, the option of :doc:`Performance Hints <optimize-inference/high-level-performance-hints>` has been introduced. It comprises two high-level "presets" focused on either **latency** or **throughput** and, essentially, makes execution specifics irrelevant.

The Performance Hints functionality makes configuration transparent to the application, for example, anticipates the need for explicit (application-side) batching or streams, and facilitates parallel processing of separate infer requests for different input sources.

Runtime, or deployment, optimization focuses on tuning inference and execution parameters. Unlike
model-level optimization, it is highly specific to the hardware you use and the goal you want
to achieve. You need to plan whether to prioritize accuracy or performance,
:doc:`throughput <optimize-inference/optimizing-throughput>` or :doc:`latency <optimize-inference/optimizing-latency>`,
or aim at the golden mean. You should also predict how scalable your application needs to be
and how exactly it is going to work with the inference component. This way, you will be able
to achieve the best results for your product.

.. note::

For more information on this topic, see the following articles:

* :doc:`Inference Devices and Modes <inference-devices-and-modes>`
* :ref:`Inputs Pre-processing with the OpenVINO <inputs_pre_processing>`
* :ref:`Async API <async_api>`
* :ref:`The 'get_tensor' Idiom <tensor_idiom>`
* For variably-sized inputs, consider :doc:`dynamic shapes <dynamic-shapes>`

Performance-Portable Inference
################################

To make configuration easier and performance optimization more portable, OpenVINO offers the
:doc:`Performance Hints <optimize-inference/high-level-performance-hints>` feature. It comprises
two high-level “presets” focused on latency **(default)** or throughput.
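
For illustration, a hint is passed as a single property at model compilation time. The model path and device name below are placeholders; this is a minimal Python sketch:

.. code-block:: python

   import openvino as ov

   core = ov.Core()
   model = core.read_model("model.xml")

   # Let the device configure itself for low latency; pass "THROUGHPUT"
   # instead to optimize for overall samples processed per second.
   compiled_model = core.compile_model(model, "CPU", {"PERFORMANCE_HINT": "LATENCY"})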

Although inference with OpenVINO Runtime can be configured with a multitude
of low-level performance settings, it is not recommended, as:

* It requires deep understanding of device architecture and the inference engine.
* It may not translate well to other device-model combinations. For example:

* CPU and GPU deduce their optimal number of streams differently.
* Different devices of the same type favor different execution configurations.
* Different models favor different parameter configurations (e.g., compute vs memory-bandwidth,
inference precision, and possible model quantization).
* Execution “scheduling” impacts performance strongly and is highly device-specific. GPU-oriented
optimizations :doc:`do not always map well to the CPU <optimize-inference/optimizing-low-level-implementation>`.

Additional Resources
####################
@@ -21,9 +21,9 @@ The hints, in contrast, respect the actual model, so the parameters for optimal
Performance Hints: Latency and Throughput
#########################################

As discussed in the :doc:`Optimization Guide <../optimize-inference>` there are a few different metrics associated with inference speed. Throughput and latency are some of the most widely used metrics that measure the overall performance of an application.
As discussed in the :doc:`Optimization Guide <../optimize-inference>` there are a few different metrics associated with inference speed. Latency and throughput are some of the most widely used metrics that measure the overall performance of an application.

Therefore, in order to ease the configuration of the device, OpenVINO offers two dedicated hints, namely ``ov::hint::PerformanceMode::THROUGHPUT`` and ``ov::hint::PerformanceMode::LATENCY``.
Therefore, in order to ease the configuration of the device, OpenVINO offers two dedicated hints, namely ``ov::hint::PerformanceMode::LATENCY`` **(default)** and ``ov::hint::PerformanceMode::THROUGHPUT``.
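
For example, with the throughput hint the compiled model also reports how many parallel infer requests the device considers optimal for it (a sketch; the model path is a placeholder):

.. code-block:: python

   import openvino as ov

   core = ov.Core()
   compiled_model = core.compile_model(
       core.read_model("model.xml"), "CPU", {"PERFORMANCE_HINT": "THROUGHPUT"}
   )

   # The device derives a matching number of streams and infer requests from the hint.
   n_requests = compiled_model.get_property("OPTIMAL_NUMBER_OF_INFER_REQUESTS")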

For more information on conducting performance measurements with the ``benchmark_app``, refer to the last section in this document.

6 changes: 6 additions & 0 deletions src/bindings/js/node/include/compiled_model.hpp
@@ -72,5 +72,11 @@ class CompiledModelWrap : public Napi::ObjectWrap<CompiledModelWrap> {
Napi::Value export_model(const Napi::CallbackInfo& info);

private:
/** @brief Gets node of a compiled model specified in CallbackInfo. */
Napi::Value get_node(const Napi::CallbackInfo& info,
const ov::Output<const ov::Node>& (ov::CompiledModel::*func)() const,
const ov::Output<const ov::Node>& (ov::CompiledModel::*func_tname)(const std::string&)const,
const ov::Output<const ov::Node>& (ov::CompiledModel::*func_idx)(size_t) const);

ov::CompiledModel _compiled_model;
};
