from functools import partial

from detrex.config import get_config
from detrex.modeling.backbone.eva import get_vit_lr_decay_rate

from ..models.dino_eva_02 import model
from ..common.coco_loader_lsj_1280 import dataloader
# Pull in detrex's default optimizer, LR schedule, and training configs;
# specific fields are overridden below for this experiment.
optimizer = get_config("common/optim.py").AdamW
lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_12ep
train = get_config("common/train.py").train
# Model overrides: ViT-L sized EVA-02 backbone with 1280x1280 inputs.
model.backbone.net.img_size = 1280
model.backbone.square_pad = 1280  # pad inputs to a 1280x1280 square
model.backbone.net.patch_size = 16
model.backbone.net.window_size = 16
model.backbone.net.embed_dim = 1024  # ViT-L width
model.backbone.net.depth = 24        # ViT-L depth
model.backbone.net.num_heads = 16
model.backbone.net.mlp_ratio = 4 * 2 / 3  # 8/3 expansion — presumably the EVA-02 default; confirm
model.backbone.net.use_act_checkpoint = True  # activation checkpointing to reduce memory
model.backbone.net.drop_path_rate = 0.4

# Blocks 5, 11, 17, 23 use global attention; every other block index below
# is listed here and therefore uses windowed attention.
model.backbone.net.window_block_indexes = (
    list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23))
)
# Training overrides.
train.init_checkpoint = "/path/to/eva02_L_pt_m38m_p14to16.pt"  # replace with real checkpoint path
# NOTE(review): the directory name says "1024" but img_size above is 1280 —
# confirm whether the name or the resolution is intended.
train.output_dir = "./output/dino_eva_02_vitdet_l_4attn_1024_lrd0p8_4scale_12ep"

# Max training iterations.
train.max_iter = 90000

# Gradient clipping for training.
train.clip_grad.enabled = True
train.clip_grad.params.max_norm = 0.1
train.clip_grad.params.norm_type = 2

# Training device; keep the model on the same device as training.
train.device = "cuda"
model.device = train.device
# Optimizer overrides.
optimizer.lr = 1e-4
optimizer.betas = (0.9, 0.999)
optimizer.weight_decay = 1e-4
# Layer-wise LR decay: factor 0.8 per layer across the 24 backbone blocks.
optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24)
optimizer.params.overrides = {}
optimizer.params.weight_decay_norm = None  # no special weight decay for norm layers
# Dataloader overrides.
dataloader.train.num_workers = 16

# NOTE: this is the TOTAL batch size across all GPUs. Suppose you are using
# 4 GPUs for training; then the per-GPU batch size is 16 / 4 = 4.
dataloader.train.total_batch_size = 16