diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py
index 39926543..9d4fda9b 100755
--- a/zoobot/pytorch/estimators/define_model.py
+++ b/zoobot/pytorch/estimators/define_model.py
@@ -239,15 +239,11 @@ def configure_optimizers(self):
 
 
     def log_outputs(self, outputs, step_name):
-        self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, rank_zero_only=True)
-        # if self.log_on_step:
-        #     # seperate call to allow for different name, to allow for consistency with TF.keras auto-names
+        # self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, rank_zero_only=True)
+        # if outputs['predictions'].shape[1] == 2:  # will only do for binary classifications
         #     self.log(
-        #         "{}/step_loss".format(step_name), outputs['loss'], on_epoch=False, on_step=True, prog_bar=True, logger=True, rank_zero_only=True)
-        if outputs['predictions'].shape[1] == 2:  # will only do for binary classifications
-            # logging.info(predictions.shape, labels.shape)
-            self.log(
-                "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, rank_zero_only=True)
+        #         "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, rank_zero_only=True)
+        pass
 
 
     def log_loss_per_question(self, multiq_loss, prefix):
@@ -255,8 +251,9 @@ def log_loss_per_question(self, multiq_loss, prefix):
         # TODO need schema attribute or similar to have access to question names, this will do for now
         # unlike Finetuneable..., does not use TorchMetrics, simply logs directly
         # TODO could use TorchMetrics and for q in schema, self.q_metric loop
-        for question_n in range(multiq_loss.shape[1]):
-            self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, rank_zero_only=True)
+        # for question_n in range(multiq_loss.shape[1]):
+        #     self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, rank_zero_only=True)
+        pass
 
 
 
diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py
index 8e8ad050..33e724a2 100644
--- a/zoobot/pytorch/training/train_with_pytorch_lightning.py
+++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py
@@ -275,9 +275,9 @@ def train_default_zoobot_from_scratch(
         save_top_k=save_top_k
     )
 
-    early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True)
-
-    callbacks = [checkpoint_callback, early_stopping_callback] + extra_callbacks
+    # early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True)
+    # , early_stopping_callback
+    callbacks = [checkpoint_callback] + extra_callbacks
 
     trainer = pl.Trainer(
         log_every_n_steps=150,  # at batch 512 (A100 MP max), DR5 has ~161 train steps
@@ -290,12 +290,12 @@ def train_default_zoobot_from_scratch(
         callbacks=callbacks,
         max_epochs=epochs,
         default_root_dir=save_dir,
-        plugins=plugins,
-        use_distributed_sampler=use_distributed_sampler
+        plugins=plugins
+        # use_distributed_sampler=use_distributed_sampler
     )
 
-    logging.info((trainer.strategy, trainer.world_size,
-                 trainer.local_rank, trainer.global_rank, trainer.node_rank))
+    # logging.info((trainer.strategy, trainer.world_size,
+    #              trainer.local_rank, trainer.global_rank, trainer.node_rank))
 
     trainer.fit(lightning_model, datamodule)  # uses batch size of datamodule
 