jittor.nn.ReLU() causes a low-level crash #593

Open
PhyllisJi opened this issue Sep 6, 2024 · 0 comments

Describe the bug

jittor.nn.ReLU() caused a low-level crash.

Full Log

Traceback (most recent call last):
  File "/home/moco_jt2/test.py", line 115, in <module>
    success, reason = train(x=None, x_t=x_t, y_t=y_t)
  File "/home/moco_jt2/test.py", line 78, in train
    jittor.flags.use_cuda = 1
RuntimeError: [f 0829 07:44:58.336236 00 executor.cc:682] 
Execute fused operator(3/48) failed. 
[JIT Source]: /root/.cache/jittor/jt1.3.7/g++8.4.0/py3.9.16/Linux-4.19.0-1xfe/IntelRXeonRPlaxe0/default/cu11.7.99_sm_70/jit/__opkey0_broadcast_to__Tx_float32__DIM_7__BCAST_19__opkey1_broadcast_to__Tx_int32__DIM_7_____hash_52d2cb7e3d055028_op.cc 
[OP TYPE]: fused_op:( broadcast_to, broadcast_to, binary.multiply, reindex_reduce.add,)
[Input]: float32[3,64,11,11,]conv1_mutated.weight, int32[1,3,224,224,], 
[Output]: float32[1,64,234,234,], 
[Async Backtrace]: not found, please set env JT_SYNC=1, trace_py_var=3 
[Reason]: could not create a primitive descriptor iterator
**********
Async error was detected. To locate the async backtrace and get better error report, please rerun your code with two enviroment variables set:
>>> export JT_SYNC=1
>>> export trace_py_var=3
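
The same two variables can also be set from Python before jittor is imported, which is convenient when rerunning a single script (a minimal sketch; only the variable names JT_SYNC and trace_py_var come from the log above):

import os
# Must be set before importing jittor so the async backtrace is recorded.
os.environ["JT_SYNC"] = "1"
os.environ["trace_py_var"] = "3"
import jittor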

Minimal Reproduce

import os
os.environ["disable_lock"] = "1"
import jittor
import jittor.nn as nn
import jittor.optim as optim
import numpy as np
import copy


class alexnet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1_mutated = jittor.nn.ConvTranspose2d(in_channels=3, kernel_size=11, out_channels=64)
        self.relu1_mutated = jittor.nn.Softmax()
        self.pool1_mutated = jittor.nn.ReplicationPad2d(padding=8)
        self.conv2_mutated = jittor.nn.ELU()
        self.relu2_mutated = jittor.nn.LeakyReLU()
        self.pool2_mutated = jittor.nn.MaxPool2d(kernel_size=3, stride=8, return_indices=False, ceil_mode=True)
        self.conv3_mutated = jittor.nn.Conv2d(in_channels=64, out_channels=384, kernel_size=(4, 8), padding=(8, 3), stride=8, groups=2, bias=False, dilation=(1, 1))
        self.relu3_mutated = jittor.nn.ReLU()
        self.tail_flatten = jittor.nn.Flatten()
        self.tail_fc = jittor.nn.Linear(in_features=9216, out_features=1000)
    
    def execute(self, x):
        x = self.conv1_mutated(x)
        x = self.relu1_mutated(x)
        x = self.pool1_mutated(x)
        x = self.conv2_mutated(x)
        x = self.relu2_mutated(x)
        x = self.pool2_mutated(x)
        x = self.conv3_mutated(x)
        x = self.relu3_mutated(x)
        x = self.tail_flatten(x)
        x = self.tail_fc(x)
        return x




def go():
    jittor.flags.use_cuda = 1
    x = jittor.randn([1, 3, 224, 224])
    m = alexnet()
    y = m(x)
    return list(y.shape)




def chebyshev_distance(A: np.ndarray, B: np.ndarray):
    if A is None or B is None:
        return 0.0
    if A.shape != B.shape:
        return 9999999
    else:
        return float(np.max(np.abs(A - B)))


def train(x, x_t, y_t):
    flag = True
    jittor.flags.use_cuda = 0
    m_c = alexnet()
    opt_c = optim.SGD(m_c.parameters(), lr=0.01)

    jittor.flags.use_cuda = 1
    m_g = copy.deepcopy(m_c)
    opt_g = optim.SGD(m_g.parameters(), lr=0.01)

    jittor.flags.use_cuda = 0
    input_c = jittor.array(x_t).float32()
    input_c = jittor.ceil_int(input_c)
    target_c = jittor.array(y_t)
    output_c = m_c(input_c)
    loss_c = nn.CrossEntropyLoss()(output_c, target_c)
    opt_c.backward(loss_c)

    jittor.flags.use_cuda = 1
    input_g = jittor.array(x_t).float32()
    input_g = jittor.ceil_int(input_g)
    target_g = jittor.array(y_t)
    output_g = m_g(input_g)
    loss_g = nn.CrossEntropyLoss()(output_g, target_g)
    opt_g.backward(loss_g)

    output_c_np = output_c.fetch_sync()
    output_g_np = output_g.fetch_sync()

    jittor.flags.use_cuda = 0
    if chebyshev_distance(output_c_np, output_g_np) > 0.1:
        flag = False
        jittor.clean()
        return flag, 'Output diff too big'
    if abs(loss_c.item() - loss_g.item()) > 0.1:
        flag = False
        jittor.clean()
        return flag, 'Loss diff too big'
    for (param_c, param_g) in zip(m_c.parameters(), m_g.parameters()):
        weights_c = param_c
        weights_g = param_g
        distance = chebyshev_distance(weights_c, weights_g)
        if distance > 0.1:
            flag = False
            break
    if not flag:
        jittor.clean()
        return flag, 'Grad diff too big'

    jittor.clean()
    return flag, ''
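
For reference, the fused op that fails in the log takes conv1_mutated.weight (the ConvTranspose2d weight) and an int32 [1, 3, 224, 224] tensor (produced by jittor.ceil_int in train()) as inputs, so the crash may already be reproducible with just that layer on CUDA. A minimal, untested sketch based on those log shapes:

import jittor

jittor.flags.use_cuda = 1
conv = jittor.nn.ConvTranspose2d(in_channels=3, out_channels=64, kernel_size=11)
x = jittor.ceil_int(jittor.randn([1, 3, 224, 224]))  # int32 input, matching the log
y = conv(x)
y.sync()  # force execution so the async error surfaces here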