From 8496592f962ab1ae5f31da02ebe866352ffbece0 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Fri, 6 Oct 2023 12:47:47 +0200
Subject: [PATCH] tiny SD fixed

---
 tools/ovc/openvino/tools/ovc/__init__.py      |  2 +-
 .../ovc/partially_upcast_nodes_to_fp32.py     | 48 +++++++++++--------
 2 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/tools/ovc/openvino/tools/ovc/__init__.py b/tools/ovc/openvino/tools/ovc/__init__.py
index 0f409e606272c5..afbdc53bda363c 100644
--- a/tools/ovc/openvino/tools/ovc/__init__.py
+++ b/tools/ovc/openvino/tools/ovc/__init__.py
@@ -2,4 +2,4 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from openvino.tools.ovc.convert import convert_model
-from openvino.tools.ovc.partially_upcast_nodes_to_fp32 import partially_upcast_nodes_to_fp32
\ No newline at end of file
+from openvino.tools.ovc.partially_upcast_nodes_to_fp32 import partially_upcast_nodes_to_fp32, inject_to_partially_upcast_nodes_to_fp32
\ No newline at end of file
diff --git a/tools/ovc/openvino/tools/ovc/partially_upcast_nodes_to_fp32.py b/tools/ovc/openvino/tools/ovc/partially_upcast_nodes_to_fp32.py
index 4366ed4921e255..51794d7a08cbb6 100644
--- a/tools/ovc/openvino/tools/ovc/partially_upcast_nodes_to_fp32.py
+++ b/tools/ovc/openvino/tools/ovc/partially_upcast_nodes_to_fp32.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Union, Tuple
+from typing import List, Dict, Union, Tuple, Callable
 
 import numpy as np
 from openvino._pyopenvino import Node
@@ -12,6 +12,7 @@
 ops_to_track_map = {
     'Convolution': convolution,
     'MatMul': matmul,
+    # todo: implement for some other ops
     # 'ReduceSum': reduce_sum,
     # 'ReduceMean': reduce_mean,
     # 'ReduceProd': reduce_prod,
@@ -20,25 +21,34 @@
 }
 
 thresholds_per_op = {
-    'Convolution': (0.1, 0.02),
-    'MatMul': (0.1, 0.05),
+    'Convolution': (0.1, 0.003, 0.00),
+    'MatMul': (0.1, 0.04, 0.03),
 }
 
+def inject_to_partially_upcast_nodes_to_fp32(orig) -> Callable:  # orig type is OVModelForCausalLM
+    def new_start_async(inputs, shared_memory):
+        new_model = partially_upcast_nodes_to_fp32(orig.model, inputs)
+        orig.model = new_model
+        orig.request = None
+        orig.compile()  # compile will set orig.request for OVModelForCausalLM
+        orig.request.start_async(inputs, shared_memory=shared_memory)
+    return new_start_async
+
+
 def partially_upcast_nodes_to_fp32(orig_model: Model, example_input: Union[List, Dict]) -> Model:
-    model = orig_model.clone()  # todo: check if need to copy orig_models
+    model = orig_model.clone()  # todo: check if need to clone orig_models
     nodes_to_track, outs_to_track, _ = insert_results_for_tracked_ops(model)
     fp16_full_net_infer_values = infer_full_net_in_fp16(nodes_to_track, model, example_input)
 
     fp16_infer_values = infer_nodes_in_fp16(nodes_to_track, fp16_full_net_infer_values)
     fp32_infer_values = infer_nodes_in_fp32(nodes_to_track, fp16_full_net_infer_values)
-    # del model
+    del model
 
     new_model = orig_model.clone()
     mark_nodes_to_upcast_to_fp32(new_model, nodes_to_track, fp16_infer_values, fp32_infer_values)
-    # todo: if no copy then need to restore original outputs
     return new_model
 
-def insert_results_for_tracked_ops(model) -> (Dict, List, List):
+def insert_results_for_tracked_ops(model) -> (List, List, List):
     # additional outputs to track inputs and output values of operations of interest
     nodes_to_track = []
     outputs = []
@@ -116,12 +126,14 @@ def infer_nodes_in_fp32(nodes_to_track: List[Node], node_data_values: List[Tuple
         results.append(infer_tracked_op_on_gpu(node, value[1:]))
     return results
 
+
 def infer_nodes_in_fp16(nodes_to_track: List[Node], node_data_values: List[Tuple]) -> List:
     results = []
     for node, value in zip(nodes_to_track, node_data_values):
         results.append(infer_tracked_op_on_gpu(node, value[1:], precision='f16'))
     return results
 
+
 def infer_tracked_op_on_gpu(op: Node, input_vals: Tuple, precision='f32') -> np.ndarray:
     parameters = []
     for input_val in input_vals:
@@ -143,7 +155,7 @@ def infer_tracked_op_on_gpu(op: Node, input_vals: Tuple, precision='f32') -> np.
     return result[0]
 
 
-def mark_nodes_to_upcast_to_fp32(model: Model, nodes: List[Node], fp16_infer_vals: List, fp32_infer_vals: List) -> List[Node]:
+def mark_nodes_to_upcast_to_fp32(model: Model, nodes: List[Node], fp16_infer_vals: List, fp32_infer_vals: List) -> None:
     nodes_with_errors = []
     for node, fp16_val, fp32_val in zip(nodes, fp16_infer_vals, fp32_infer_vals):
         if compare_tensors(node, fp16_val, fp32_val):
@@ -151,14 +163,8 @@ def mark_nodes_to_upcast_to_fp32(model: Model, nodes: List[Node], fp16_infer_val
 
     for node in model.get_ordered_ops():
         if node.get_friendly_name() in nodes_with_errors:
-            # todo: for tiny SD this does not work because of xxx-122082
             node.get_rt_info()['disable_fp16_compression_0'] = ''
 
-    from openvino.runtime.passes import VisualizeTree, Manager
-    # manager = Manager()
-    # manager.register_pass(VisualizeTree("upcasted.svg"))
-    # manager.run_passes(model)
-
 
 def compare_tensors(node: Node, a: np.ndarray, b: np.ndarray) -> bool:
     """
@@ -174,13 +180,15 @@ def compare_tensors(node: Node, a: np.ndarray, b: np.ndarray) -> bool:
     rel_error = np.abs(2 * (a_ - b_) / (np.abs(a_) + abs(b_)))
     mean_rel_error = np.mean(rel_error)
 
-    rel_tol = 0.03  # check if necessary to move to a threshold dict
-    if mean_rel_error < rel_tol:
-        return False
+
+    thresholds = thresholds_per_op[node.get_type_name()]
+    rel_threshold = thresholds[0]
+    rel_threshold_ratio = thresholds[1]
+    rel_tol = thresholds[2]
 
-    rel_threshold = thresholds_per_op[node.get_type_name()][0]
-    rel_threshold_ratio = thresholds_per_op[node.get_type_name()][1]
     rel_diff_ratio = np.size(np.where(rel_error >= rel_threshold)) / out_size
+    if mean_rel_error < rel_tol:
+        return False
     if rel_diff_ratio > rel_threshold_ratio:
-        print(f'upcasted node {node.get_friendly_name()} with 0.1 rel2_diff_ratio {rel_diff_ratio}')
+        print(f'upcasted node {node.get_friendly_name()} with 0.1 rel2_diff_ratio {rel_diff_ratio} and mean_rel_error {mean_rel_error}')
         return True
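
Usage note (not part of the patch): a minimal sketch of how the injected wrapper above might be exercised, assuming an already compiled, optimum-intel OVModelForCausalLM-style object exposing the .model, .request and .compile() attributes the helper relies on. The names ov_model and example_inputs below are placeholders for illustration, not a confirmed API.

    from openvino.tools.ovc import inject_to_partially_upcast_nodes_to_fp32

    # ov_model and example_inputs are hypothetical; any OVModelForCausalLM-like
    # wrapper with .model/.request/.compile() should match the helper's expectations.
    start_async_with_upcast = inject_to_partially_upcast_nodes_to_fp32(ov_model)

    # The call re-runs the fp16-vs-fp32 analysis on ov_model.model, marks the
    # error-prone Convolution/MatMul nodes with disable_fp16_compression_0,
    # recompiles the model, and then forwards the request as usual.
    start_async_with_upcast(example_inputs, shared_memory=True)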