/tmp/tmpeilktith/main.c: In function ‘list_to_cuuint64_array’:
/tmp/tmpeilktith/main.c:354:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpeilktith/main.c:354:3: note: use option -std=c99 or -std=gnu99 to compile your code
/tmp/tmpeilktith/main.c: In function ‘list_to_cuuint32_array’:
/tmp/tmpeilktith/main.c:365:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmptf4cg5_c/main.c: In function ‘list_to_cuuint64_array’:
/tmp/tmptf4cg5_c/main.c:354:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmptf4cg5_c/main.c:354:3: note: use option -std=c99 or -std=gnu99 to compile your code
/tmp/tmptf4cg5_c/main.c: In function ‘list_to_cuuint32_array’:
/tmp/tmptf4cg5_c/main.c:365:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmp9utu3rwt/main.c: In function ‘list_to_cuuint64_array’:
/tmp/tmp9utu3rwt/main.c:354:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmp9utu3rwt/main.c:354:3: note: use option -std=c99 or -std=gnu99 to compile your code
/tmp/tmp9utu3rwt/main.c: In function ‘list_to_cuuint32_array’:
/tmp/tmp9utu3rwt/main.c:365:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpfwwxs7d8/main.c: In function ‘list_to_cuuint64_array’:
/tmp/tmpfwwxs7d8/main.c:354:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpfwwxs7d8/main.c:354:3: note: use option -std=c99 or -std=gnu99 to compile your code
/tmp/tmpfwwxs7d8/main.c: In function ‘list_to_cuuint32_array’:
/tmp/tmpfwwxs7d8/main.c:365:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpsomyu47p/main.c: In function ‘list_to_cuuint64_array’:
/tmp/tmpsomyu47p/main.c:354:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpsomyu47p/main.c:354:3: note: use option -std=c99 or -std=gnu99 to compile your code
/tmp/tmpsomyu47p/main.c: In function ‘list_to_cuuint32_array’:
/tmp/tmpsomyu47p/main.c:365:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpul3l8_vy/main.c: In function ‘list_to_cuuint64_array’:
/tmp/tmpul3l8_vy/main.c:354:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpul3l8_vy/main.c:354:3: note: use option -std=c99 or -std=gnu99 to compile your code
/tmp/tmpul3l8_vy/main.c: In function ‘list_to_cuuint32_array’:
/tmp/tmpul3l8_vy/main.c:365:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmp_25p7zmi/main.c: In function ‘list_to_cuuint64_array’:
/tmp/tmp_25p7zmi/main.c:354:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmp_25p7zmi/main.c:354:3: note: use option -std=c99 or -std=gnu99 to compile your code
/tmp/tmp_25p7zmi/main.c: In function ‘list_to_cuuint32_array’:
/tmp/tmp_25p7zmi/main.c:365:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpaqkjsyxw/main.c: In function ‘list_to_cuuint64_array’:
/tmp/tmpaqkjsyxw/main.c:354:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
/tmp/tmpaqkjsyxw/main.c:354:3: note: use option -std=c99 or -std=gnu99 to compile your code
/tmp/tmpaqkjsyxw/main.c: In function ‘list_to_cuuint32_array’:
/tmp/tmpaqkjsyxw/main.c:365:3: error: ‘for’ loop initial declarations are only allowed in C99 mode
for (Py_ssize_t i = 0; i < len; i++) {
^
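For context on the repeated gcc failures above: the generated main.c declares the loop counter inside the for statement (`for (Py_ssize_t i = 0; ...)`), which only fails on compilers that default to a pre-C99 standard; GCC releases before 5 default to gnu90, while GCC 5 and later default to gnu11 and accept this code. Below is a minimal sketch (assuming Triton's build helper honors the CC environment variable and otherwise falls back to the system gcc, which is /usr/bin/gcc in the failing command further down) to check which compiler and version will be picked up:

import os
import subprocess

# Print the version of the compiler that the cuda_utils build will invoke.
# Assumption: CC is honored if set, otherwise the system gcc is used.
cc = os.environ.get("CC", "gcc")
print(subprocess.run([cc, "--version"], capture_output=True, text=True).stdout)

# Assumption, not a confirmed fix: pointing CC at a newer gcc that defaults to
# C99 or later before Triton builds its extension should avoid the
# "-std=c99" complaint. The path below is hypothetical; adjust to your system.
# os.environ["CC"] = "/usr/local/bin/gcc-9"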
BackendCompilerFailed Traceback (most recent call last)
Cell In[5], line 2
1 start_time = time.time()
----> 2 masks = mask_generator.generate(image)
3 print(time.time()-start_time)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/pytorch_labs_segment_anything_fast-0.2-py3.11.egg/segment_anything_fast/automatic_mask_generator.py:170, in SamAutomaticMaskGenerator.generate(self, image)
145 """
146 Generates masks for the given image.
147
(...)
166 the mask, given in XYWH format.
167 """
169 # Generate masks
--> 170 mask_data = self._generate_masks(image)
172 # Filter small disconnected regions and holes in masks
173 if self.min_mask_region_area > 0:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/pytorch_labs_segment_anything_fast-0.2-py3.11.egg/segment_anything_fast/automatic_mask_generator.py:213, in SamAutomaticMaskGenerator._generate_masks(self, image)
211 data = MaskData()
212 for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
--> 213 crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
214 data.cat(crop_data)
216 # Remove duplicate masks between crops
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/pytorch_labs_segment_anything_fast-0.2-py3.11.egg/segment_anything_fast/automatic_mask_generator.py:243, in SamAutomaticMaskGenerator._process_crop(self, image, crop_box, crop_layer_idx, orig_size)
241 cropped_im = image[y0:y1, x0:x1, :]
242 cropped_im_size = cropped_im.shape[:2]
--> 243 self.predictor.set_image(cropped_im)
245 # Get points for this crop
246 points_scale = np.array(cropped_im_size)[None, ::-1]
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/pytorch_labs_segment_anything_fast-0.2-py3.11.egg/segment_anything_fast/predictor.py:60, in SamPredictor.set_image(self, image, image_format)
57 input_image_torch = torch.as_tensor(input_image, device=self.device)
58 input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[None, :, :, :]
---> 60 self.set_torch_image(input_image_torch, image.shape[:2])
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/pytorch_labs_segment_anything_fast-0.2-py3.11.egg/segment_anything_fast/predictor.py:90, in SamPredictor.set_torch_image(self, transformed_image, original_image_size)
88 input_image = self.model.preprocess(transformed_image)
89 model_dtype = self.model.mask_decoder.iou_prediction_head.layers[0].weight.dtype
---> 90 self.features = self.model.image_encoder(input_image.to(model_dtype))
91 self.is_image_set = True
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py:489, in _TorchDynamoContext.__call__.<locals>._fn(*args, **kwargs)
487 dynamo_config_ctx.enter()
488 try:
--> 489 return fn(*args, **kwargs)
490 finally:
491 set_eval_frame(prior)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py:655, in catch_errors_wrapper.<locals>.catch_errors(frame, cache_entry, frame_state)
652 return hijacked_callback(frame, cache_entry, hooks, frame_state)
654 with compile_lock, _disable_current_modes():
--> 655 return callback(frame, cache_entry, hooks, frame_state)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:727, in convert_frame.<locals>._convert_frame(frame, cache_entry, hooks, frame_state)
725 counters["frames"]["total"] += 1
726 try:
--> 727 result = inner_convert(frame, cache_entry, hooks, frame_state)
728 counters["frames"]["ok"] += 1
729 return result
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:383, in convert_frame_assert.<locals>._convert_frame_assert(frame, cache_entry, hooks, frame_state)
370 signpost_event(
371 "dynamo",
372 "_convert_frame_assert._compile",
(...)
379 },
380 )
382 with config.patch(_patch_config_if_changed()):
--> 383 compiled_product = _compile(
384 frame.f_code,
385 frame.f_globals,
386 frame.f_locals,
387 frame.f_builtins,
388 compiler_fn,
389 one_graph,
390 export,
391 export_constraints,
392 hooks,
393 cache_size,
394 frame,
395 frame_state=frame_state,
396 compile_id=compile_id,
397 )
398 return compiled_product
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:646, in _compile(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)
644 with compile_context(CompileContext(compile_id)):
645 try:
--> 646 guarded_code = compile_inner(code, one_graph, hooks, transform)
647 return guarded_code
648 except (
649 Unsupported,
650 TorchRuntimeError,
(...)
657 BisectValidationException,
658 ) as e:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/utils.py:244, in dynamo_timed.<locals>.dynamo_timed_inner.<locals>.time_wrapper(*args, **kwargs)
242 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
243 t0 = time.time()
--> 244 r = func(*args, **kwargs)
245 time_spent = time.time() - t0
246 compilation_time_metrics[key].append(time_spent)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:562, in _compile.<locals>.compile_inner(code, one_graph, hooks, transform)
560 CompileContext.get().attempt = attempt
561 try:
--> 562 out_code = transform_code_object(code, transform)
563 break
564 except exc.RestartAnalysis as e:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py:1033, in transform_code_object(code, transformations, safe)
1030 instructions = cleaned_instructions(code, safe)
1031 propagate_line_nums(instructions)
-> 1033 transformations(instructions, code_options)
1034 return clean_and_assemble_instructions(instructions, keys, code_options)[1]
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:151, in preserve_global_state.<locals>._fn(*args, **kwargs)
149 cleanup = setup_compile_debug()
150 try:
--> 151 return fn(*args, **kwargs)
152 finally:
153 cleanup.close()
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:527, in _compile.<locals>.transform(instructions, code_options)
525 try:
526 with tracing(tracer.output.tracing_context), tracer.set_current_tx():
--> 527 tracer.run()
528 except exc.UnspecializeRestartAnalysis:
529 speculation_log.clear()
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py:2128, in InstructionTranslator.run(self)
2127 def run(self):
-> 2128 super().run()
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py:818, in InstructionTranslatorBase.run(self)
813 try:
814 self.output.push_tx(self)
815 while (
816 self.instruction_pointer is not None
817 and not self.output.should_exit
--> 818 and self.step()
819 ):
820 pass
821 except BackendCompilerFailed:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py:781, in InstructionTranslatorBase.step(self)
777 unimplemented(f"missing: {inst.opname}")
778 TracingContext.set_current_loc(
779 self.f_code.co_filename, self.lineno, self.f_code.co_name
780 )
--> 781 getattr(self, inst.opname)(inst)
783 return inst.opname != "RETURN_VALUE"
784 except Unsupported:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py:2243, in InstructionTranslator.RETURN_VALUE(self, inst)
2238 _step_logger()(
2239 logging.INFO,
2240 f"torchdynamo done tracing {self.f_code.co_name} (RETURN_VALUE)",
2241 )
2242 log.debug("RETURN_VALUE triggered compile")
-> 2243 self.output.compile_subgraph(
2244 self,
2245 reason=GraphCompileReason(
2246 "return_value", [self.frame_summary()], graph_break=False
2247 ),
2248 compile_return_value=True,
2249 )
2250 self.output.add_output_instructions([create_instruction("RETURN_VALUE")])
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/output_graph.py:919, in OutputGraph.compile_subgraph(self, tx, partial_convert, reason, compile_return_value)
916 append_prefix_insts()
917 # optimization to generate better code in a common case
918 self.add_output_instructions(
--> 919 self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
920 + [create_instruction("UNPACK_SEQUENCE", arg=len(stack_values))]
921 )
922 else:
923 graph_output_var = self.new_var("graph_out")
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/contextlib.py:81, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
78 @wraps(func)
79 def inner(*args, **kwds):
80 with self._recreate_cm():
---> 81 return func(*args, **kwds)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/output_graph.py:1087, in OutputGraph.compile_and_call_fx_graph(self, tx, rv, root)
1084 self.tracing_context.fake_mode = backend_fake_mode
1086 with self.restore_global_state():
-> 1087 compiled_fn = self.call_user_compiler(gm)
1088 compiled_fn = disable(compiled_fn)
1090 counters["stats"]["unique_graphs"] += 1
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/utils.py:244, in dynamo_timed.<locals>.dynamo_timed_inner.<locals>.time_wrapper(*args, **kwargs)
242 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
243 t0 = time.time()
--> 244 r = func(*args, **kwargs)
245 time_spent = time.time() - t0
246 compilation_time_metrics[key].append(time_spent)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/output_graph.py:1159, in OutputGraph.call_user_compiler(self, gm)
1157 raise e
1158 except Exception as e:
-> 1159 raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
1160 e.__traceback__
1161 ) from None
1163 signpost_event(
1164 "dynamo",
1165 "OutputGraph.call_user_compiler",
(...)
1171 },
1172 )
1174 return compiled_fn
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/output_graph.py:1140, in OutputGraph.call_user_compiler(self, gm)
1138 if config.verify_correctness:
1139 compiler_fn = WrapperBackend(compiler_fn)
-> 1140 compiled_fn = compiler_fn(gm, self.example_inputs())
1141 _step_logger()(logging.INFO, f"done compiler function {name}")
1142 assert callable(compiled_fn), "compiler_fn did not return callable"
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/repro/after_dynamo.py:117, in wrap_backend_debug.<locals>.debug_wrapper(gm, example_inputs, **kwargs)
115 raise
116 else:
--> 117 compiled_gm = compiler_fn(gm, example_inputs)
119 return compiled_gm
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/__init__.py:1662, in TorchCompileInductorWrapper.__call__(self, model_, inputs_)
1659 def __call__(self, model_, inputs_):
1660 from torch._inductor.compile_fx import compile_fx
-> 1662 return compile_fx(model_, inputs_, config_patches=self.config)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:952, in compile_fx(model_, example_inputs_, inner_compile, config_patches, decompositions)
950 if config_patches:
951 with config.patch(config_patches):
--> 952 return compile_fx(
953 model_,
954 example_inputs_,
955 # need extra layer of patching as backwards is compiled out of scope
956 inner_compile=config.patch(config_patches)(inner_compile),
957 decompositions=decompositions,
958 )
960 if config.cpp_wrapper:
961 with config.patch(
962 {
963 "cpp_wrapper": False,
(...)
967 }
968 ), V.set_real_inputs(example_inputs_):
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1168, in compile_fx(model_, example_inputs_, inner_compile, config_patches, decompositions)
1163 return inference_compiler(unlifted_gm, example_inputs_)
1165 with V.set_fake_mode(fake_mode), torch._guards.tracing(
1166 tracing_context
1167 ), compiled_autograd.disable():
-> 1168 return aot_autograd(
1169 fw_compiler=fw_compiler,
1170 bw_compiler=bw_compiler,
1171 inference_compiler=inference_compiler,
1172 decompositions=decompositions,
1173 partition_fn=partition_fn,
1174 keep_inference_input_mutations=True,
1175 )(model, example_inputs_)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/backends/common.py:55, in aot_autograd.<locals>.compiler_fn(gm, example_inputs)
52 try:
53 # NB: NOT cloned!
54 with enable_aot_logging(), patch_config:
---> 55 cg = aot_module_simplified(gm, example_inputs, **kwargs)
56 counters["aot_autograd"]["ok"] += 1
57 return disable(cg)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_functorch/aot_autograd.py:887, in aot_module_simplified(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)
871 aot_config = AOTConfig(
872 fw_compiler=fw_compiler,
873 bw_compiler=bw_compiler,
(...)
883 no_tangents=False,
884 )
886 with compiled_autograd.disable():
--> 887 compiled_fn = create_aot_dispatcher_function(
888 functional_call,
889 full_args,
890 aot_config,
891 )
893 # TODO: There is something deeply wrong here; compiled_fn running with
894 # the boxed calling convention, but aot_module_simplified somehow
895 # historically returned a function that was not the boxed calling
896 # convention. This should get fixed...
897 def forward(*runtime_args):
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/utils.py:244, in dynamo_timed.<locals>.dynamo_timed_inner.<locals>.time_wrapper(*args, **kwargs)
242 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
243 t0 = time.time()
--> 244 r = func(*args, **kwargs)
245 time_spent = time.time() - t0
246 compilation_time_metrics[key].append(time_spent)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_functorch/aot_autograd.py:600, in create_aot_dispatcher_function(flat_fn, flat_args, aot_config)
597 compiler_fn = partial(aot_wrapper_dedupe, compiler_fn=compiler_fn)
598 # You can put more passes here
--> 600 compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata)
601 if aot_config.is_export:
602 mutated_user_inp_locs = [
603 idx - aot_config.num_params_buffers
604 for idx in fw_metadata.mutated_inp_runtime_indices
605 if idx >= aot_config.num_params_buffers
606 ]
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:425, in aot_wrapper_dedupe(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)
422 break
424 if ok:
--> 425 return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata)
427 if requires_subclass_dispatch(leaf_flat_args, fw_metadata):
428 raise RuntimeError(
429 """
430 Encountered duplicate inputs that are mutated in the graph, but at least one input/output
431 to the graph is a tensor subclass. This is not supported today. You can try to
432 remove the aliasing yourself as a workaround, or otherwise file an issue on github."""
433 )
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:630, in aot_wrapper_synthetic_base(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)
628 # Happy path: we don't need synthetic bases
629 if synthetic_base_info is None:
--> 630 return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata)
632 # export path: ban synthetic bases for now, add later if requested.
633 if requires_subclass_dispatch(flat_args, fw_metadata):
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:97, in aot_dispatch_base(flat_fn, flat_args, aot_config, fw_metadata)
91 if tracing_context := torch._guards.TracingContext.try_get():
92 tracing_context.fw_metadata = (
93 fw_metadata
94 if maybe_subclass_meta is None
95 else maybe_subclass_meta.fw_metadata
96 )
---> 97 compiled_fw = compiler(fw_module, updated_flat_args)
99 # This boxed_call handling happens inside create_runtime_wrapper as well.
100 # However, create_runtime_wrapper does not expect the rng offsets in the
101 # output. So, we have to create another wrapper and take out the offset. As
102 # a result, we have to account for not boxed_call compilers as well.
103 if not hasattr(compiled_fw, "_boxed_call"):
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/utils.py:244, in dynamo_timed.<locals>.dynamo_timed_inner.<locals>.time_wrapper(*args, **kwargs)
242 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
243 t0 = time.time()
--> 244 r = func(*args, **kwargs)
245 time_spent = time.time() - t0
246 compilation_time_metrics[key].append(time_spent)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1100, in compile_fx.<locals>.fw_compiler_base(model, example_inputs, is_inference)
1092 assert orig_output_end_idx <= num_model_outputs
1094 user_visible_outputs = {
1095 n.name
1096 for n in model_outputs[original_output_start_index:orig_output_end_idx]
1097 if isinstance(n, torch.fx.Node)
1098 }
-> 1100 return inner_compile(
1101 model,
1102 example_inputs,
1103 num_fixed=fixed,
1104 cudagraphs=cudagraphs,
1105 graph_id=graph_id,
1106 is_inference=is_inference,
1107 boxed_forward_device_index=forward_device,
1108 user_visible_outputs=user_visible_outputs,
1109 )
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/contextlib.py:81, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
78 @wraps(func)
79 def inner(*args, **kwds):
80 with self._recreate_cm():
---> 81 return func(*args, **kwds)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/repro/after_aot.py:83, in wrap_compiler_debug.<locals>.debug_wrapper(gm, example_inputs, **kwargs)
78 assert config.repro_after in ("dynamo", "aot", None)
80 try:
81 # Call the compiler_fn - which is either aot_autograd or inductor
82 # with fake inputs
---> 83 inner_compiled_fn = compiler_fn(gm, example_inputs)
84 except Exception as e:
85 # TODO: Failures here are troublesome because no real inputs,
86 # need a different serialization strategy
87 if config.repro_after == "aot":
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/debug.py:305, in DebugContext.wrap.<locals>.inner(*args, **kwargs)
302 @functools.wraps(fn)
303 def inner(*args, **kwargs):
304 with DebugContext():
--> 305 return fn(*args, **kwargs)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/contextlib.py:81, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
78 @wraps(func)
79 def inner(*args, **kwds):
80 with self._recreate_cm():
---> 81 return func(*args, **kwds)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:320, in compile_fx_inner(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt, extern_node_serializer)
316 compiled_graph = FxGraphCache.load(
317 fx_codegen_and_compile, gm, example_inputs, graph_kwargs
318 )
319 else:
--> 320 compiled_graph = fx_codegen_and_compile(
321 gm, example_inputs, **graph_kwargs # type: ignore[arg-type]
322 )
324 log.debug("FX codegen and compilation took %.3fs", time.time() - start)
326 # Return the output strides to the caller via TracingContext
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:535, in fx_codegen_and_compile(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt, extern_node_serializer)
519 graph = GraphLowering(
520 gm,
521 # example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.
(...)
532 is_inference=is_inference,
533 )
534 with V.set_graph_handler(graph):
--> 535 graph.run(*example_inputs)
536 output_strides: List[Optional[Tuple[int, ...]]] = []
537 if graph.graph_outputs is not None:
538 # We'll put the output strides in the compiled graph so we
539 # can later return them to the caller via TracingContext
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_dynamo/utils.py:244, in dynamo_timed.<locals>.dynamo_timed_inner.<locals>.time_wrapper(*args, **kwargs)
242 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
243 t0 = time.time()
--> 244 r = func(*args, **kwargs)
245 time_spent = time.time() - t0
246 compilation_time_metrics[key].append(time_spent)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/graph.py:519, in GraphLowering.run(self, *args)
517 @dynamo_timed
518 def run(self, *args):
--> 519 return super().run(*args)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/fx/interpreter.py:138, in Interpreter.run(self, initial_env, enable_io_processing, *args)
135 continue
137 try:
--> 138 self.env[node] = self.run_node(node)
139 except Exception as e:
140 if self.extra_traceback:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/graph.py:814, in GraphLowering.run_node(self, n)
812 debug("layout_constraints")
813 args, kwargs = layout_constraints[n.target](n, *args, **kwargs)
--> 814 result = self.call_function(n.target, args, kwargs)
815 elif is_magic_method(n.target):
816 # TODO: this is sus, it probably should be handled in the
817 # lowerings themselves similarly to sym_size/sym-stride
818 debug("is_magic_method")
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/graph.py:694, in GraphLowering.call_function(self, target, args, kwargs)
692 return out
693 except Exception as e:
--> 694 raise LoweringException(e, target, args, kwargs).with_traceback(
695 e.__traceback__
696 ) from None
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/graph.py:691, in GraphLowering.call_function(self, target, args, kwargs)
689 try:
690 log.debug(" via %s", lowerings[target])
--> 691 out = lowerings[target](*args, **kwargs)
692 return out
693 except Exception as e:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/lowering.py:291, in _register_lowering.<locals>.wrapped(*args, **kwargs)
288 if unpacked:
289 args = [args]
--> 291 out = decomp_fn(*args, **kwargs)
292 validate_ir(out)
294 return out
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py:367, in convolution(x, weight, bias, stride, padding, dilation, transposed, output_padding, groups)
363 return convert_1x1_conv_to_mm(x, weight, bias)
365 if bias is not None and ir.get_device_type(x) != "cpu":
366 # peel off the bias, cudnn is slower with it
--> 367 result = convolution(x, weight, None, **kwargs)
368 return L[aten.add](
369 result, L[aten.view](bias, [result.get_size()[1]] + ndim * [1])
370 )
372 x.realize()
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/lowering.py:291, in _register_lowering.<locals>.wrapped(*args, **kwargs)
288 if unpacked:
289 args = [args]
--> 291 out = decomp_fn(*args, **kwargs)
292 validate_ir(out)
294 return out
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py:457, in convolution(x, weight, bias, stride, padding, dilation, transposed, output_padding, groups)
432 for cfg in conv_configs(
433 sympy_product([x.get_size()[0], *x.get_size()[2:]]),
434 out_chan,
435 in_chan,
436 ):
437 conv2d_template.maybe_append_choice(
438 choices,
439 input_nodes=(x, weight),
(...)
454 **cfg.kwargs,
455 )
--> 457 return autotune_select_algorithm("convolution", choices, args, layout)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py:991, in autotune_select_algorithm(*args, **kwargs)
989 if _ALGORITHM_SELECTOR_CACHE is None:
990 _ALGORITHM_SELECTOR_CACHE = AlgorithmSelectorCache()
--> 991 return _ALGORITHM_SELECTOR_CACHE(*args, **kwargs)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py:748, in AlgorithmSelectorCache.__call__(self, name, choices, input_nodes, layout, input_gen_fns)
745 tuning_pool.initialize()
747 autotune_start_ts = time.time()
--> 748 timings = self.lookup(
749 choices,
750 name,
751 repr([self.key_of(x) for x in input_nodes]),
752 autotune,
753 )
754 autotune_elapse = time.time() - autotune_start_ts
755 if timings == {} or choices[0] not in timings:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/codecache.py:291, in PersistentCache.lookup(self, choices, name, inputs, benchmark)
285 if not check_cache(local_cache) and not (
286 use_global_cache()
287 and check_cache(self.get_global_cache(), callback=log_stats)
288 ):
289 try:
290 # re-benchmark everything to try to get consistent numbers from the same machine
--> 291 timings = benchmark(choices)
292 assert all(choice in timings for choice in choices)
294 local_cache.setdefault(name, {})
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py:739, in AlgorithmSelectorCache.__call__.<locals>.autotune(choices)
738 def autotune(choices):
--> 739 return make_benchmark_fn()(choices)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py:848, in AlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_in_current_process(choices)
846 for choice in choices:
847 try:
--> 848 timing = benchmark_choice_in_current_process(choice)
849 except CUDACompileError as e:
850 log.warning(
851 "CUDA compilation error: \n%s. \nIgnore this choice.", str(e)
852 )
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py:838, in AlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_choice_in_current_process(choice)
835 result = choice.benchmark(*example_inputs_extern, out=out_extern)
836 else:
837 # triton templates want the base pointer for sliced tensors
--> 838 result = choice.benchmark(*example_inputs, out=out)
839 if VERIFY:
840 torch.testing.assert_close(out_extern, expected, **VERIFY)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py:604, in TritonTemplateCaller.benchmark(self, out, *args)
602 def benchmark(self, *args, out):
603 assert self.bmreq is not None
--> 604 return self.bmreq.benchmark(*args, output_tensor=out)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/autotune_process.py:452, in BenchmarkRequest.benchmark(self, output_tensor, *input_tensors)
449 load_elapse = time.time() - start_ts
450 start_ts = time.time()
--> 452 out = do_bench(fn)
453 torch.cuda.synchronize() # shake out any CUDA errors
455 if debug:
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/torch/_inductor/utils.py:167, in do_bench(*args, **kwargs)
165 if quantile_field_name not in kwargs:
166 kwargs[quantile_field_name] = (0.5, 0.2, 0.8)
--> 167 return triton_do_bench(*args, **kwargs)[0]
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/testing.py:102, in do_bench(fn, warmup, rep, grad_to_none, quantiles, fast_flush, return_mode)
83 import torch
84 """
85 Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
86 the 20-th and 80-th performance percentile.
(...)
99 :type fast_flush: bool
100 """
--> 102 fn()
103 torch.cuda.synchronize()
105 # We maintain a buffer of 256 MB that we clear
106 # before each kernel call to make sure that the L2
107 # doesn't contain any input data before the run
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/runtime/jit.py:550, in JITFunction.run(self, *args, **kwargs)
548 bin = self.cache[device][key]
549 if not warmup:
--> 550 bin.c_wrapper(
551 grid_0,
552 grid_1,
553 grid_2,
554 bin.num_warps,
555 bin.num_ctas,
556 bin.clusterDims[0],
557 bin.clusterDims[1],
558 bin.clusterDims[2],
559 bin.shared,
560 stream,
561 bin.cu_function,
562 CompiledKernel.launch_enter_hook,
563 CompiledKernel.launch_exit_hook,
564 bin,
565 *bin.assemble_tensormap_to_arg(non_constexpr_arg_values),
566 )
567 return bin
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/compiler/compiler.py:692, in CompiledKernel.__getattribute__(self, name)
690 def __getattribute__(self, name):
691 if name == 'c_wrapper':
--> 692 self._init_handles()
693 return super().__getattribute__(name)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/compiler/compiler.py:670, in CompiledKernel._init_handles(self)
668 if self.device_type in ["cuda"]:
669 device = get_current_device()
--> 670 bin_path = {driver.HIP: "hsaco_path", driver.CUDA: "cubin"}[driver.backend]
671 max_shared = driver.utils.get_device_properties(device)["max_shared_mem"]
672 fn_load_binary = driver.utils.load_binary
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/runtime/driver.py:157, in LazyProxy.__getattr__(self, name)
156 def __getattr__(self, name):
--> 157 self._initialize_obj()
158 return getattr(self._obj, name)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/runtime/driver.py:154, in LazyProxy._initialize_obj(self)
152 def _initialize_obj(self):
153 if self._obj is None:
--> 154 self._obj = self._init_fn()
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/runtime/driver.py:187, in initialize_driver()
185 return HIPDriver()
186 elif torch.cuda.is_available():
--> 187 return CudaDriver()
188 else:
189 return UnsupportedDriver()
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/runtime/driver.py:77, in CudaDriver.__init__(self)
76 def __init__(self):
---> 77 self.utils = CudaUtils()
78 self.backend = self.CUDA
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/runtime/driver.py:47, in CudaUtils.__init__(self)
45 with open(src_path, "w") as f:
46 f.write(src)
---> 47 so = _build("cuda_utils", src_path, tmpdir)
48 with open(so, "rb") as f:
49 cache_path = cache.put(f.read(), fname, binary=True)
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/common/build.py:103, in _build(name, src, srcdir)
98 cc_cmd = [
99 cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda",
100 "-o", so
101 ]
102 cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs]
--> 103 ret = subprocess.check_call(cc_cmd)
105 if ret == 0:
106 return so
File ~/anaconda3/envs/segment_fast_env/lib/python3.11/subprocess.py:413, in check_call(*popenargs, **kwargs)
411 if cmd is None:
412 cmd = popenargs[0]
--> 413 raise CalledProcessError(retcode, cmd)
414 return 0
BackendCompilerFailed: backend='inductor' raised:
LoweringException: CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpaqkjsyxw/main.c', '-O3', '-I/root/anaconda3/envs/segment_fast_env/lib/python3.11/site-packages/triton/common/../third_party/cuda/include', '-I/root/anaconda3/envs/segment_fast_env/include/python3.11', '-I/tmp/tmpaqkjsyxw', '-shared', '-fPIC', '-lcuda', '-o', '/tmp/tmpaqkjsyxw/cuda_utils.cpython-311-x86_64-linux-gnu.so', '-L/lib64', '-L/lib', '-L/lib64', '-L/lib']' returned non-zero exit status 1.
target: aten.convolution.default
args[0]: TensorBox(StorageBox(
InputBuffer(name='arg457_1', layout=FixedLayout('cuda', torch.bfloat16, size=[1, 3, 1024, 1024], stride=[3145728, 1048576, 1024, 1]))
))
args[1]: TensorBox(StorageBox(
InputBuffer(name='arg69_1', layout=FixedLayout('cuda', torch.bfloat16, size=[1280, 3, 16, 16], stride=[768, 256, 16, 1]))
))
args[2]: TensorBox(StorageBox(
InputBuffer(name='arg70_1', layout=FixedLayout('cuda', torch.bfloat16, size=[1280], stride=[1]))
))
args[3]: [16, 16]
args[4]: [0, 0]
args[5]: [1, 1]
args[6]: False
args[7]: [0, 0]
args[8]: 1
Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information
You can suppress this exception and fall back to eager by setting:
import torch._dynamo
torch._dynamo.config.suppress_errors = True
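As the tail of the message suggests, one temporary escape hatch is to suppress Dynamo errors so the model falls back to eager execution instead of failing inside the inductor backend. A minimal sketch of that workaround applied to the cell that triggered the failure; note it gives up the torch.compile speedup and only hides the underlying gcc/C99 build problem:

import time

import torch._dynamo

# Fall back to eager mode when the inductor backend fails to compile,
# per the suggestion at the end of the BackendCompilerFailed message.
torch._dynamo.config.suppress_errors = True

start_time = time.time()
masks = mask_generator.generate(image)  # same call as in Cell In[5]
print(time.time() - start_time)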