# train_novel_shapenet.py

import argparse
import random
import torch
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import open_clip
from utils.mv_utils_zs_ver_2 import Realistic_Projection_Learnable_new as Realistic_Projection 
from model.PointNet import PointNetfeat, feature_transform_regularizer, STN3d
from model.curvenet import *
from model.Transformation import Transformation
from utils.datautil_3D_memory_incremental_shapenet import *
from model.Relation import RelationNetwork
import os
import numpy as np
from matplotlib import pyplot as plt
from torch import nn
from utils.Loss import CombinedConstraintLoss
from model.Unet_dropout import UNetPlusPlus
from torchmetrics.functional.image import image_gradients
from configs.shapenet_info import task_ids_total as tid
import json
# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


def set_random_seed(seed):
    """Seed every random number generator this script can draw from.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed.
    """
    random.seed(seed)
    # NumPy only accepts seeds in [0, 2**32), so fold larger/negative values
    # into that range instead of raising.
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# read a txt file line by line and save it in a list, and remove the empty lines
def read_txt_file(file):
    """Read a text file and turn each non-empty line into a depth-map prompt.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped, and the remaining ones are prefixed with
    ``"A depth map of "``.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of prompt strings, one per non-empty line, in file order.
    """
    with open(file, 'r') as f:
        names = [line.strip() for line in f]
    # Filter BEFORE adding the prefix: a prefixed empty line is a non-empty
    # string and would otherwise survive the filter.
    return ["A depth map of " + name for name in names if name]


def read_txt_file_class_name(file):
    """Return the stripped, non-empty lines of a text file.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one entry per line that is non-empty after stripping
        surrounding whitespace, in file order.
    """
    with open(file, 'r') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [entry for entry in stripped_lines if entry]

# read json file
def read_json_file(file):
    """Load a JSON document from disk.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The object decoded by ``json.load``.
    """
    with open(file, 'r') as handle:
        return json.load(handle)

def accuracy(output, target, topk=(1,)):
    """Count top-k correct predictions for a batch.

    Args:
        output: 2-D tensor of scores; ``topk`` is taken along dimension 1.
        target: Tensor of ground-truth class indices, one per row of
            ``output`` (it is reshaped to a single row for the comparison).
        topk: Iterable of ``k`` values to evaluate.

    Returns:
        A list of Python floats, one per ``k`` in ``topk``, each holding the
        NUMBER (not the percentage) of rows whose target is among the ``k``
        highest-scoring classes.
    """
    # Indices of the max(topk) best classes per row, transposed so that row r
    # of `pred` holds the rank-r prediction for every sample.
    pred = output.topk(max(topk), 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    # .item() yields a Python float directly; float() on a 1-element NumPy
    # array is deprecated in recent NumPy releases.
    return [correct[:k].reshape(-1).float().sum().item() for k in topk]

# define the main function
def _task_index(label, base_classes=25, classes_per_task=5):
    """Return the 0-based incremental task that class index ``label`` belongs to.

    Labels below ``base_classes`` map to task 0; every following group of
    ``classes_per_task`` labels maps to the next task.
    """
    if label < base_classes:
        return 0
    return int((label - base_classes) // classes_per_task) + 1


def main(opt):
    """Fine-tune the relation module task by task and evaluate after each task.

    Loads CurveNet, STN3d, UNet++ and relation-network checkpoints from
    ``cls/shapenet/`` plus a frozen open_clip ViT-B-16 model, then iterates
    over seven incremental tasks. Task 0 is evaluated only; every later task
    trains for ``opt.nepoch`` epochs on the task's training loader, with each
    batch extended by samples drawn from the stored memory bank. Only the
    relation network's parameters are handed to the optimizer. Per-task and
    overall test accuracy are printed after every task.

    Args:
        opt: Parsed command-line namespace. This function reads
            ``manualSeed``, ``dataset_path``, ``batch_size`` and ``nepoch``;
            the whole namespace is also forwarded to ``DatasetGen``.
    """
    num_rotations = 1
    num_tasks = 7
    base_classes = 25       # classes available at task 0
    classes_per_task = 5    # classes added by every later task
    set_random_seed(opt.manualSeed)

    # Point-cloud encoder.
    curvenet = CurveNet()
    curvenet = curvenet.to(device)
    curvenet.load_state_dict(torch.load('cls/shapenet/curvenet_40.pth', map_location=device))

    # Step 1: Load CLIP model and freeze its parameters.
    clip_model, _, _ = open_clip.create_model_and_transforms('ViT-B-16', pretrained='laion2b_s34b_b88k')
    clip_model.to(device)
    for param in clip_model.parameters():
        param.requires_grad = False

    # Step 2: Load Realistic Projection object
    proj = Realistic_Projection().to(device)

    # Step 3: Load the Transformation model (one STN3d per view)
    transform = {str(i): STN3d() for i in range(num_rotations)}
    for i in range(num_rotations):
        transform[str(i)].to(device)
        transform[str(i)].load_state_dict(torch.load('cls/shapenet/transform_40_%d.pth' % i, map_location=device))

    # load the Unet model
    unet = UNetPlusPlus().to(device)
    unet.load_state_dict(torch.load('cls/shapenet/unet_40.pth', map_location=device))

    # Step 4: Load the Relation Network
    relation = RelationNetwork(1536, 2048, 1024)
    relation = relation.to(device)
    relation.load_state_dict(torch.load('cls/shapenet/relation_40.pth', map_location=device))

    # load the text features
    class_name = read_txt_file_class_name("class_name_shapenet.txt")
    prompts = read_json_file("shapenet.json")

    # Only the relation network is optimized.
    # NOTE(review): gradients still flow into the transform / UNet / CurveNet
    # parameters during backward, but they are never stepped or zeroed here -
    # confirm this is intended.
    optimizer = optim.Adam(relation.parameters(), lr=0.001, betas=(0.9, 0.999))

    # load loss functions
    cross_entrpy = nn.BCELoss()
    constraint_loss = CombinedConstraintLoss(num_rotations=num_rotations)
    loss_orthogonal_weight = 0.01
    mse_loss = nn.MSELoss()

    # load memory bank as a numpy array and move it to the device
    memory_bank = np.load('memory/memory_bank_shapenet.npy')
    memory_bank_label = np.load('memory/memory_bank_label_shapenet.npy')
    memory_bank = torch.from_numpy(memory_bank).to(device)
    memory_bank_label = torch.from_numpy(memory_bank_label).to(device)

    for t in range(num_tasks):
        path = Path(opt.dataset_path)
        print(path)
        dataloader = DatasetGen(opt, root=path, fewshot=5)
        dataset = dataloader.get(t, 'training')
        trainDataLoader = dataset[t]['train']
        testDataLoader = dataset[t]['test']
        num_category = base_classes + t * classes_per_task
        print('task:', t)

        # Class ids of every task seen so far; constant for the whole task.
        tid_all = []
        for h in range(t + 1):
            tid_all += tid[h]

        # train the model
        clip_model.train()
        for i in range(num_rotations):
            transform[str(i)].train()
        unet.train()
        relation.train()
        curvenet.train()
        print("=> Start training the model")

        # Memory bank restricted to the classes of the previous tasks.
        memory_bank_task = memory_bank[0:(num_category - classes_per_task), :, :]
        memory_bank_label_task = memory_bank_label[0:(num_category - classes_per_task), :]

        # Task 0 is evaluated with the loaded checkpoints only.
        nepoch = 0 if t == 0 else opt.nepoch

        for epoch in range(nepoch):
            train_loss = 0
            train_correct = 0
            train_total = 0
            for i, data in tqdm(enumerate(trainDataLoader, 0)):
                points = data['pointclouds'].to(device).float()
                target = data['labels'].to(device)
                # Incomplete batches are skipped.
                if points.shape[0] < opt.batch_size:
                    continue

                # Append up to opt.batch_size random samples from the memory bank.
                indices = torch.randperm(memory_bank_task.shape[0])[:opt.batch_size]
                memory_bank_task_samples = memory_bank_task[indices, :, :]
                memory_bank_label_task_samples = memory_bank_label_task[indices, :]
                points = torch.cat((points, memory_bank_task_samples), 0)
                target = torch.cat((target, memory_bank_label_task_samples.squeeze(1)), 0)
                # Actual size of the mixed batch; used instead of a hard-coded
                # 2 * batch_size so a small memory bank does not break shapes.
                num_samples = points.shape[0]

                optimizer.zero_grad()
                points = points.transpose(2, 1)

                # Forward samples to the point-cloud encoder
                points_embedding = curvenet(points)

                # transformation module: one 3x3 matrix per sample and view
                trans = torch.zeros((num_samples, num_rotations, 3, 3), device=device)
                for jj in range(num_rotations):
                    trans[:, jj, :, :] = transform[str(jj)](points)
                loss_orthogonal = constraint_loss(trans).mean()

                # depth map generation
                points = points.transpose(2, 1)
                depth_map = torch.zeros((num_samples * num_rotations, 3, 224, 224)).to(device)
                for jj in range(num_rotations):
                    depth_map_tmp = proj.get_img(points, trans[:, jj, :, :].view(-1, 9))
                    depth_map_tmp = torch.nn.functional.interpolate(depth_map_tmp, size=(224, 224), mode='bilinear', align_corners=True)
                    depth_map[jj * num_samples:(jj + 1) * num_samples, :, :, :] = depth_map_tmp

                loss_gradient = 0
                RGB_map = torch.zeros((num_samples * num_rotations, 3, 224, 224)).to(device)
                for jj in range(num_rotations):
                    view_depth = depth_map[jj * num_samples:(jj + 1) * num_samples]
                    # unet model: runs on the binary mask of pixels whose
                    # inverted depth is non-zero
                    depth_map_reverse = 1 - view_depth
                    mask = (depth_map_reverse != 0).float()
                    texture_map = unet(mask)
                    # loss for gradient: match image gradients of texture and mask
                    dy_init, dx_init = image_gradients(mask)
                    dy, dx = image_gradients(texture_map)
                    loss_gradient += mse_loss(dy, dy_init) + mse_loss(dx, dx_init)
                    RGB_map[jj * num_samples:(jj + 1) * num_samples, :, :, :] = view_depth * texture_map

                # Forward samples to the vision CLIP model and average over views
                img_embedding_tmp = clip_model.encode_image(RGB_map).to(device)
                img_embedding = 0
                for jj in range(num_rotations):
                    img_embedding += img_embedding_tmp[jj * num_samples:(jj + 1) * num_samples, :] / num_rotations

                # merge img_embedding and points_embedding
                img_embedding = img_embedding / img_embedding.norm(dim=-1, keepdim=True)
                points_embedding = points_embedding / points_embedding.norm(dim=-1, keepdim=True)
                fea_embedding = torch.cat((img_embedding, points_embedding), 1)

                # Sample one random prompt per seen class from the prompts dictionary.
                prompts_batch = []
                for j in range(num_category):
                    # The prompt key is the second space-separated token of
                    # the class-name line.
                    tmp_1 = class_name[tid_all[j]].split(' ')
                    tmp_2 = prompts[tmp_1[1]]
                    random_idx = random.randint(0, len(tmp_2) - 1)
                    prompts_batch.append(tmp_2[random_idx])

                # Forward samples to the text CLIP model
                text = open_clip.tokenize(prompts_batch)
                text_embedding = clip_model.encode_text(text.to(device))

                # normalize the text embedding
                text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)

                # forwarding samples to the Relation module: build every
                # (sample, class) pair as [text embedding | fused feature]
                text_embedding = text_embedding.unsqueeze(0).repeat(num_samples, 1, 1).to(device)
                fea_embedding = fea_embedding.unsqueeze(0).repeat(num_category, 1, 1)
                fea_embedding = torch.transpose(fea_embedding, 0, 1).to(device)
                relation_pairs = torch.cat((text_embedding.float(), fea_embedding.float()), 2).view(-1, 1536)
                relations = relation(relation_pairs.float()).view(-1, num_category).to(device)

                # calculate the loss
                one_hot_labels = torch.zeros((num_samples, num_category), device=device).scatter_(1, target.long().view(-1, 1), 1)
                loss_t = cross_entrpy(relations, one_hot_labels)
                loss = loss_t + loss_orthogonal * loss_orthogonal_weight + loss_gradient
                # NOTE(review): retain_graph=True is kept from the original;
                # confirm whether any part of the graph is really reused
                # across iterations before removing it.
                loss.backward(retain_graph=True)
                optimizer.step()

                # Calculating the accuracy
                train_loss += loss.clone().detach().item()
                prediction = relations.cpu().detach().numpy()
                prediction = np.argmax(prediction, axis=1)
                target = target.cpu().detach().numpy()
                train_total += target.shape[0]
                train_correct += np.sum(prediction == target)

                # delete the variables to free the memory
                del points, target, depth_map, img_embedding, text_embedding, loss
                torch.cuda.empty_cache()
            print('Relation Module','Point embedding + img _embedding:',loss_orthogonal_weight, 'number of view', num_rotations)
            # max(..., 1) avoids a ZeroDivisionError when every batch was skipped.
            print(f"=> Epoch {epoch} loss: {train_loss:.2f} accuracy: {100 * train_correct / max(train_total, 1):.2f}")

        # evaluate the model
        base_class_correct = 0

        for i in range(num_rotations):
            transform[str(i)].eval()
        relation.eval()
        unet.eval()
        clip_model.eval()
        curvenet.eval()

        # load the text features for every class
        prompts_test = read_txt_file("class_name_shapenet.txt")
        text = open_clip.tokenize(prompts_test)
        text_embedding_all_classes = clip_model.encode_text(text.to(device))

        # Per-task counters (index k <-> "task k+1" in the printed report).
        task_correct = [0] * num_tasks
        task_total = [0] * num_tasks

        for j, data in tqdm(enumerate(testDataLoader, 0)):
            points = data['pointclouds'].to(device).float()
            target = data['labels'].to(device)
            with torch.no_grad():
                # Forward samples to the point-cloud encoder. The batch is
                # duplicated and only row 0 of the fused feature is used below.
                # NOTE(review): presumably the test loader yields one sample
                # per batch - confirm.
                points = points.transpose(2, 1)
                points = points.repeat(2, 1, 1)
                points_embedding = curvenet(points)
                num_samples = points.shape[0]

                # transformation module
                trans = torch.zeros((num_samples, num_rotations, 3, 3), device=device)
                for jj in range(num_rotations):
                    trans[:, jj, :, :] = transform[str(jj)](points)

                # depth map generation
                points = points.transpose(2, 1)
                depth_map = torch.zeros((num_samples * num_rotations, 3, 224, 224)).to(device)
                for jj in range(num_rotations):
                    depth_map_tmp = proj.get_img(points, trans[:, jj, :, :].view(-1, 9))
                    depth_map_tmp = torch.nn.functional.interpolate(depth_map_tmp, size=(224, 224), mode='bilinear', align_corners=True)
                    depth_map[jj * num_samples:(jj + 1) * num_samples, :, :, :] = depth_map_tmp

                RGB_map = torch.zeros((num_samples * num_rotations, 3, 224, 224)).to(device)
                for jj in range(num_rotations):
                    view_depth = depth_map[jj * num_samples:(jj + 1) * num_samples]
                    # unet model
                    depth_map_reverse = 1 - view_depth
                    mask = (depth_map_reverse != 0).float()
                    texture_map = unet(mask)
                    RGB_map[jj * num_samples:(jj + 1) * num_samples, :, :, :] = view_depth * texture_map

                # Forward samples to the CLIP model and average over views
                img_embedding_tmp = clip_model.encode_image(RGB_map).to(device)
                img_embedding = 0
                for jj in range(num_rotations):
                    img_embedding += img_embedding_tmp[jj * num_samples:(jj + 1) * num_samples, :] / num_rotations

                # merge img_embedding and points_embedding
                img_embedding = img_embedding / img_embedding.norm(dim=-1, keepdim=True)
                points_embedding = points_embedding / points_embedding.norm(dim=-1, keepdim=True)
                fea_embedding = torch.cat((img_embedding, points_embedding), 1)
                fea_embedding = fea_embedding[0, :].unsqueeze(0)

                # Text embeddings of the classes seen so far
                text_embedding = text_embedding_all_classes[tid_all].to(device)
                text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)

                # forwarding samples to the Relation module
                text_embedding = text_embedding.unsqueeze(0).to(device)
                fea_embedding = fea_embedding.unsqueeze(0).repeat(num_category, 1, 1).to(device)
                fea_embedding = torch.transpose(fea_embedding, 0, 1).to(device)
                relation_pairs = torch.cat((text_embedding.float(), fea_embedding.float()), 2).view(-1, 1536)
                relations = relation(relation_pairs.float()).view(-1, num_category).to(device)

            prediction = relations.cpu().detach().numpy()
            prediction = np.argmax(prediction, axis=1)
            target_np = target.cpu().detach().numpy()

            # Both conversions require exactly one test sample per batch and
            # raise otherwise, as the original truth-value tests did.
            is_correct = bool(prediction == target_np)
            label = target_np.item()

            if is_correct:
                base_class_correct += 1

            # Labels beyond the last task's range are not counted per task.
            task_idx = _task_index(label, base_classes, classes_per_task)
            if task_idx < num_tasks:
                task_total[task_idx] += 1
                if is_correct:
                    task_correct[task_idx] += 1

        # Overall accuracy is relative to the number of test batches.
        acc = (base_class_correct / max(len(testDataLoader), 1)) * 100

        for k in range(num_tasks):
            if task_total[k] > 0:
                print('task%d:' % (k + 1), task_correct[k] / task_total[k])
        print(f"=> total accuracy: {acc:.2f}")
        print('-------------------------------------------------------------------------')

        # put the models back in the training mode
        for i in range(num_rotations):
            transform[str(i)].train()
        relation.train()
        unet.train()
        clip_model.train()
        curvenet.train()
    

if __name__ == "__main__":
    # Command-line options; the parsed namespace is handed to main() unchanged.
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default= 16, help='input batch size')
    parser.add_argument('--num_points', type=int, default=2048, help='number of points in each input point cloud')
    parser.add_argument('--workers', type=int, help='number of data loading workers', default=4)
    parser.add_argument('--nepoch', type=int, default=14, help='number of epochs to train for')
    parser.add_argument('--outf', type=str, default='cls', help='output folder to save results')
    parser.add_argument('--model', type=str, default='cls/3D_model_249.pth', help='path to load a pre-trained model')
    parser.add_argument('--feature_transform', action='store_true', help='use feature transform')
    parser.add_argument('--manualSeed', type=int, default = 42, help='random seed')
    parser.add_argument('--dataset_path', type=str, default= 'dataset/FSCIL/shapenet/', help="dataset path")
    # NOTE(review): the next four options are parsed as strings, not ints -
    # confirm that their consumers convert them.
    parser.add_argument('--ntasks', type=str, default= '5', help="number of tasks")
    parser.add_argument('--nclasses', type=str, default= '25', help="number of classes")
    parser.add_argument('--task', type=str, default= '0', help="task number")
    parser.add_argument('--num_samples', type=str, default= '0', help="number of samples per class")
    parser.add_argument('--process_data', action='store_true', default=False, help='save data offline')
    parser.add_argument('--num_point', type=int, default=2048, help='Point Number')
    parser.add_argument('--use_uniform_sample', action='store_true', default=False, help='use uniform sampiling')
    parser.add_argument('--use_normals', action='store_true', default=False, help='use normals')
    # The default (25) must be an accepted choice; otherwise passing the
    # default explicitly on the command line is rejected by argparse.
    parser.add_argument('--num_category', default=25, type=int, choices=[20, 25, 40],  help='training on ModelNet10/40')
    parser.add_argument('--sem_file', default=None,  help='training on ModelNet10/40')
    # NOTE(review): these two take a value and have no type, so any value given
    # on the command line (including "False") arrives as a non-empty string.
    parser.add_argument('--use_memory', default=False, help='use_memory')
    parser.add_argument('--herding', default=True, help='herding')
    opt = parser.parse_args()
    main(opt)
    print("Done!")