podgorskiy
diff --git a/‎configs/mnist.yaml
+18 b/‎configs/mnist.yaml
+18
diff --git a/‎dataloading.py
+138 b/‎dataloading.py
+138
diff --git a/‎defaults.py
+44 b/‎defaults.py
+44
diff --git a/‎evaluation.py
+128 b/‎evaluation.py
+128
@@ -0,0 +1,18 @@
+# Experiment configuration for MNIST.
+# NOTE(review): presumably merged on top of the defaults declared in defaults.py - confirm against the caller.
+DATASET:
+  # Number of data_fold_<i>.pkl files that make_datasets() expects under PATH.
+  FOLDS_COUNT: 5
+  MEAN: 0.1307
+  # Directory that holds the pickled folds.
+  PATH: mnist
+  STD: 0.3081
+  # make_datasets() enumerates class labels as range(TOTAL_CLASS_COUNT).
+  TOTAL_CLASS_COUNT: 10
+  PERCENTAGES: [10, 20, 30, 40, 50]
+MODEL:
+  # defaults.py declares 32 for this key.
+  LATENT_SIZE: 16
+  Z_DISCRIMINATOR_CROSS_BATCH: False
+  INPUT_IMAGE_SIZE: 32
+  INPUT_IMAGE_CHANNELS: 1
+OUTPUT_DIR: results
+TRAIN:
+  BASE_LEARNING_RATE: 0.002
+  # defaults.py declares 256 for this key.
+  BATCH_SIZE: 128
+  EPOCH_COUNT: 80
+
@@ -0,0 +1,138 @@
+# Copyright 2018-2020 Stanislav Pidhorskyi
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import torch.utils.data
+from net import *
+import pickle
+import numpy as np
+from os import path
+import dlutils
+import warnings
+
+
+class Dataset:
+    """In-memory dataset of ``(label, sample)`` pairs kept as two parallel NumPy arrays.
+
+    ``self.x`` holds the samples (second element of every pair, ``float32``) and
+    ``self.y`` holds the labels (first element of every pair, integer dtype).
+    Indexing returns ``(label, sample)``, i.e. the same order as the input pairs.
+    """
+
+    @staticmethod
+    def list_of_pairs_to_numpy(l):
+        """Split a list of ``(label, sample)`` pairs into ``(samples, labels)`` arrays."""
+        samples = np.asarray([pair[1] for pair in l], np.float32)
+        # ``np.int`` was only an alias of the builtin ``int`` and was removed in
+        # NumPy 1.24; the builtin selects the same dtype on every NumPy version.
+        labels = np.asarray([pair[0] for pair in l], int)
+        return samples, labels
+
+    def __init__(self, data):
+        """Build the dataset from an iterable of ``(label, sample)`` pairs."""
+        self.x, self.y = Dataset.list_of_pairs_to_numpy(data)
+
+    def __getitem__(self, index):
+        """Return ``(labels, samples)`` for an integer index or a slice.
+
+        An out-of-range integer raises ``IndexError`` (from NumPy), which is what
+        lets plain ``for item in dataset`` iteration terminate.
+        """
+        # NumPy indexing already handles integers and slices alike, including a
+        # slice step, so no special casing is needed.
+        return self.y[index], self.x[index]
+
+    def __len__(self):
+        """Return the number of stored pairs."""
+        return len(self.y)
+
+    def shuffle(self):
+        """Shuffle labels and samples in place with one shared random permutation."""
+        permutation = np.random.permutation(self.y.shape[0])
+        for array in (self.y, self.x):
+            # The permuted copy is fully built before it is written back, so the
+            # in-place assignment never reads rows that were already overwritten.
+            array[...] = array[permutation]
+
+
+def make_datasets(cfg, folding_id, inliner_classes):
+    """Build the train, validation and test datasets for one cross-validation fold.
+
+    Fold ``folding_id`` becomes the test set. Of the remaining folds, the first one
+    (in index order) becomes the validation set and all others are concatenated
+    into the training set. Training samples whose label is not in
+    ``inliner_classes`` are dropped; validation and test sets keep every class.
+
+    Args:
+        cfg: configuration object providing ``DATASET.PATH`` (directory with the
+            ``data_fold_<i>.pkl`` files) and ``DATASET.FOLDS_COUNT``.
+        folding_id: index of the fold that is held out for testing.
+        inliner_classes: container of the labels that count as inliers.
+
+    Returns:
+        Tuple ``(train_set, valid_set, test_set)`` of ``Dataset`` objects.
+    """
+    def load_fold(fold_id):
+        # The file name is formatted *before* joining so that a '%' inside
+        # DATASET.PATH can never be mistaken for a format directive.
+        fold_path = path.join(cfg.DATASET.PATH, 'data_fold_%d.pkl' % fold_id)
+        # NOTE: unpickling can execute arbitrary code - only load trusted fold files.
+        with open(fold_path, 'rb') as pkl:
+            return pickle.load(pkl)
+
+    data_train = []
+    data_valid = None  # None means "validation fold not chosen yet"
+
+    for i in range(cfg.DATASET.FOLDS_COUNT):
+        if i == folding_id:
+            continue
+        fold = load_fold(i)
+        if data_valid is None:
+            # An explicit sentinel keeps an empty first fold from being silently
+            # replaced by the next one.
+            data_valid = fold
+        else:
+            data_train += fold
+
+    if data_valid is None:
+        # No fold besides the test fold exists.
+        data_valid = []
+
+    data_train = [x for x in data_train if x[0] in inliner_classes]
+    data_test = load_fold(folding_id)
+
+    train_set = Dataset(data_train)
+    valid_set = Dataset(data_valid)
+    test_set = Dataset(data_test)
+
+    return train_set, valid_set, test_set
+
+
+def make_dataloader(dataset, batch_size, device):
+    """Wrap ``dataset`` in a ``dlutils.batch_provider`` that yields torch tensors on ``device``.
+
+    Every batch ``(y, x)`` is converted to ``(y_tensor, x_tensor)``: ``x`` is divided
+    by 255.0 and stored as ``float32`` with ``requires_grad=True``; ``y`` is stored
+    as ``int32``.
+    """
+    class BatchCollator:
+        """Callable that converts one ``(y, x)`` batch into a pair of tensors."""
+
+        def __init__(self, target_device):
+            self.device = target_device
+
+        def __call__(self, batch):
+            labels, samples = batch
+            with torch.no_grad():
+                # presumably rescales 8-bit pixel values to [0, 1] - verify against the data
+                scaled = samples / 255.0
+                sample_tensor = torch.tensor(
+                    scaled, requires_grad=True, dtype=torch.float32, device=self.device)
+                label_tensor = torch.tensor(labels, dtype=torch.int32, device=self.device)
+            return label_tensor, sample_tensor
+
+    collator = BatchCollator(device)
+    return dlutils.batch_provider(dataset, batch_size, collator)
+
+
+def create_set_with_outlier_percentage(dataset, inliner_classes, target_percentage, concervative=True):
+    """Return a new, shuffled ``Dataset`` in which ``target_percentage`` % of the items are outliers.
+
+    An item is an inlier when its label (``item[0]``) is in ``inliner_classes`` and
+    an outlier otherwise.
+
+    Side effects: the global NumPy random generator is re-seeded with 0 and the
+    input ``dataset`` is shuffled in place.
+
+    Args:
+        dataset: source dataset; must support ``shuffle()`` and iteration over
+            ``(label, sample)`` items.
+        inliner_classes: container of the labels that count as inliers.
+        target_percentage: requested percentage of outliers in the result.
+        concervative: if true (default) no item is discarded - the group that is
+            too small is enlarged by repeating its items. If false, the group that
+            is too large is truncated instead.
+
+    Returns:
+        A new ``Dataset`` with the requested share of outliers.
+
+    Raises:
+        ValueError: if a group that has to be enlarged contains no items.
+        AssertionError: if the resulting percentage deviates from the target by
+            0.01 or more.
+    """
+    np.random.seed(0)
+    dataset.shuffle()
+
+    # Partition in a single pass; the relative order inside each group is kept.
+    dataset_outlier = []
+    dataset_inliner = []
+    for item in dataset:
+        if item[0] in inliner_classes:
+            dataset_inliner.append(item)
+        else:
+            dataset_outlier.append(item)
+
+    def increase_length(data_list, target_length):
+        """Repeat ``data_list`` cyclically and cut it to exactly ``target_length`` items."""
+        if not data_list:
+            raise ValueError("Can't reach the requested percentage of outliers: "
+                             "one of the groups (inliers/outliers) is empty")
+        repeat = (target_length + len(data_list) - 1) // len(data_list)  # ceiling division
+        return (data_list * repeat)[:target_length]
+
+    inliner_count = len(dataset_inliner)
+    outlier_count = len(dataset_outlier)
+
+    if not concervative:
+        wanted_outliers = inliner_count * target_percentage // (100 - target_percentage)
+
+        if outlier_count > wanted_outliers:
+            # Enough outliers are available: drop the surplus.
+            dataset_outlier = dataset_outlier[:wanted_outliers]
+        else:
+            # Too few outliers: keep all of them and drop surplus inliers instead.
+            wanted_inliners = outlier_count * (100 - target_percentage) // target_percentage
+            dataset_inliner = dataset_inliner[:wanted_inliners]
+    else:
+        current_percentage = outlier_count * 100 / (outlier_count + inliner_count)
+
+        if current_percentage < target_percentage:  # we don't have enough outliers
+            wanted_outliers = int(inliner_count * target_percentage / (100.0 - target_percentage))
+            dataset_outlier = increase_length(dataset_outlier, wanted_outliers)
+        else:  # we don't have enough inliers
+            wanted_inliners = int(outlier_count * (100.0 - target_percentage) / target_percentage)
+            dataset_inliner = increase_length(dataset_inliner, wanted_inliners)
+
+    dataset = Dataset(dataset_outlier + dataset_inliner)
+
+    dataset.shuffle()
+
+    # Post checks: recount on the final dataset and compare with the request.
+    outlier_count = sum(1 for x in dataset if x[0] not in inliner_classes)
+    inliner_count = sum(1 for x in dataset if x[0] in inliner_classes)
+    real_percentage = outlier_count * 100.0 / (outlier_count + inliner_count)
+    assert abs(real_percentage - target_percentage) < 0.01, "Didn't create dataset with requested percentage of outliers"
+
+    return dataset
@@ -0,0 +1,44 @@
+from yacs.config import CfgNode as CN
+
+
+# Root node that holds the default value of every configuration option.
+_C = CN()
+
+# NOTE(review): presumably the directory where results are written - confirm against the caller.
+_C.OUTPUT_DIR = "results"
+
+_C.DATASET = CN()
+
+# NOTE(review): presumably the outlier percentages to evaluate - confirm against the caller.
+_C.DATASET.PERCENTAGES = [10, 20, 30, 40, 50]
+
+# Values for MNIST
+_C.DATASET.MEAN = 0.1307
+_C.DATASET.STD = 0.3081
+
+# Directory that holds the pickled folds (data_fold_<i>.pkl).
+_C.DATASET.PATH = "mnist"
+# Class labels are enumerated as range(TOTAL_CLASS_COUNT).
+_C.DATASET.TOTAL_CLASS_COUNT = 10
+# Number of fold files expected under PATH.
+_C.DATASET.FOLDS_COUNT = 5
+
+_C.MODEL = CN()
+_C.MODEL.LATENT_SIZE = 32
+_C.MODEL.INPUT_IMAGE_SIZE = 32
+_C.MODEL.INPUT_IMAGE_CHANNELS = 1
+# If True, use a z discriminator that looks at the entire batch
+# (an earlier note called this option "zd_merge").
+_C.MODEL.Z_DISCRIMINATOR_CROSS_BATCH = False
+
+
+_C.TRAIN = CN()
+
+_C.TRAIN.BATCH_SIZE = 256
+_C.TRAIN.EPOCH_COUNT = 80
+_C.TRAIN.BASE_LEARNING_RATE = 0.002
+
+_C.TEST = CN()
+_C.TEST.BATCH_SIZE = 1024
+
+# NOTE(review): presumably toggles plot generation - confirm against the caller.
+_C.MAKE_PLOTS = True
+
+
+def get_cfg_defaults():
+    """Return a fresh copy of the default configuration.
+
+    A clone is handed out so that callers can modify their copy freely without
+    altering the module-level defaults stored in ``_C``.
+    """
+    defaults_copy = _C.clone()
+    return defaults_copy
@@ -0,0 +1,128 @@
+import numpy as np
+from sklearn.metrics import roc_auc_score
+import pickle
+import os
+
+
+def get_f1(true_positive, false_positive, false_negative):
+    """Return the F1 score (harmonic mean of precision and recall) from raw counts.
+
+    Without any true positive the score is defined as 0.0, which also avoids
+    dividing by zero.
+    """
+    if true_positive != 0:
+        predicted_positive = true_positive + false_positive
+        actual_positive = true_positive + false_negative
+        precision = true_positive / predicted_positive
+        recall = true_positive / actual_positive
+        numerator = 2.0 * precision * recall
+        return numerator / (precision + recall)
+    return 0.0
+
+
+def _area_under_pr(positive_scores, negative_scores, thresholds):
+    """Sum precision * (drop in recall) while sweeping ``thresholds`` in the given order."""
+    area = 0.0
+    previous_recall = 1.0
+    precision = recall = 0.0
+    for cut in thresholds:
+        tp = np.sum(np.greater_equal(positive_scores, cut))
+        fp = np.sum(np.greater_equal(negative_scores, cut))
+        if tp + fp == 0:
+            # Nothing is accepted at this threshold: precision is undefined.
+            continue
+        precision = tp / (tp + fp)
+        recall = tp / float(len(positive_scores))
+        area += (previous_recall - recall) * precision
+        previous_recall = recall
+    # Close the curve with the last computed point.
+    area += recall * precision
+    return area
+
+
+def evaluate(logger, percentage_of_outliers, inliner_classes, prediction, threshold, gt_inlier):
+    """Score an inlier/outlier detector, log the metrics and append them to ``results.txt``.
+
+    A sample is predicted to be an inlier when its score is strictly greater than
+    ``threshold``. Accuracy and F1 use that decision; AUC, FPR at 95% TPR,
+    detection error and both AUPR values are derived from the raw scores, the
+    last four by sweeping a threshold from ``min(prediction) - 1`` to
+    ``max(prediction) + 1`` in steps of 0.2.
+
+    Args:
+        logger: object with an ``info(message)`` method.
+        percentage_of_outliers: value that is only reported in the log and the results file.
+        inliner_classes: iterable of inlier labels; only used to label the results entry.
+        prediction: sequence of scores, higher meaning "more likely an inlier".
+        threshold: decision threshold applied to ``prediction``.
+        gt_inlier: sequence of booleans, true where the sample really is an inlier.
+
+    Returns:
+        dict with the keys ``auc``, ``f1``, ``fpr95``, ``error``, ``auprin`` and ``auprout``.
+    """
+    y = np.greater(prediction, threshold)
+
+    gt_outlier = np.logical_not(gt_inlier)
+
+    true_positive = np.sum(np.logical_and(y, gt_inlier))
+    true_negative = np.sum(np.logical_and(np.logical_not(y), gt_outlier))
+    false_positive = np.sum(np.logical_and(y, gt_outlier))
+    false_negative = np.sum(np.logical_and(np.logical_not(y), gt_inlier))
+    total_count = true_positive + true_negative + false_positive + false_negative
+
+    accuracy = 100 * (true_positive + true_negative) / total_count
+
+    try:
+        auc = roc_auc_score(gt_inlier, prediction)
+    except ValueError as exc:
+        # roc_auc_score rejects inputs for which the score is undefined (e.g. only
+        # one class present in the ground truth); report 0 in that case.
+        logger.info("AUC is undefined (%s), reporting 0" % exc)
+        auc = 0
+
+    f1 = get_f1(true_positive, false_positive, false_negative)
+
+    logger.info("Percentage %f" % percentage_of_outliers)
+    logger.info("Accuracy %f" % accuracy)
+    logger.info("F1 %f" % f1)
+    logger.info("AUC %f" % auc)
+
+    # Scores split by ground truth; converted to arrays once instead of on every comparison.
+    inlier_scores = np.asarray([score for is_inlier, score in zip(gt_inlier, prediction) if is_inlier])
+    outlier_scores = np.asarray([score for is_inlier, score in zip(gt_inlier, prediction) if not is_inlier])
+
+    minP = min(prediction) - 1
+    maxP = max(prediction) + 1
+    thresholds = np.arange(minP, maxP, 0.2)
+
+    # ``float`` replaces ``np.float``, an alias of the builtin removed in NumPy 1.24.
+    inlier_total = float(len(inlier_scores))
+    outlier_total = float(len(outlier_scores))
+
+    ##################################################################
+    # FPR at TPR 95
+    ##################################################################
+    fpr95 = 0.0
+    closest_tpr = 1.0
+    dist_tpr = 1.0
+    for cut in thresholds:
+        tpr = np.sum(np.greater_equal(inlier_scores, cut)) / inlier_total
+        fpr = np.sum(np.greater_equal(outlier_scores, cut)) / outlier_total
+        if abs(tpr - 0.95) < dist_tpr:
+            dist_tpr = abs(tpr - 0.95)
+            closest_tpr = tpr
+            fpr95 = fpr
+
+    logger.info("tpr: %f" % closest_tpr)
+    logger.info("fpr95: %f" % fpr95)
+
+    ##################################################################
+    # Detection error
+    ##################################################################
+    error = 1.0
+    for cut in thresholds:
+        # Share of inliers rejected plus share of outliers accepted, averaged.
+        miss_rate = np.sum(np.less(inlier_scores, cut)) / inlier_total
+        fpr = np.sum(np.greater_equal(outlier_scores, cut)) / outlier_total
+        error = np.minimum(error, (miss_rate + fpr) / 2.0)
+
+    logger.info("Detection error: %f" % error)
+
+    ##################################################################
+    # AUPR IN
+    ##################################################################
+    auprin = _area_under_pr(inlier_scores, outlier_scores, thresholds)
+
+    logger.info("auprin: %f" % auprin)
+
+    ##################################################################
+    # AUPR OUT
+    ##################################################################
+    # Negating the scores makes outliers the "positive" class for the same sweep.
+    auprout = _area_under_pr(-outlier_scores, -inlier_scores, np.arange(-maxP, -minP, 0.2))
+
+    logger.info("auprout: %f" % auprout)
+
+    # NOTE(review): the "Error" field is filled with the detection error, the same
+    # value as "Detection" - confirm whether a different quantity was intended.
+    with open("results.txt", "a") as file:
+        file.write(
+            "Class: %s\n Percentage: %d\n"
+            "Error: %f\n F1: %f\n AUC: %f\nfpr95: %f"
+            "\nDetection: %f\nauprin: %f\nauprout: %f\n\n" %
+            ("_".join([str(x) for x in inliner_classes]), percentage_of_outliers, error, f1, auc, fpr95, error, auprin, auprout))
+
+    return dict(auc=auc, f1=f1, fpr95=fpr95, error=error, auprin=auprin, auprout=auprout)