From d39f47a8bcc4b29f02d0a0b6e72cba821669776f Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Mon, 21 Sep 2020 23:44:53 +0530 Subject: [PATCH 01/11] Introduce 'balance' arg in flow_from_directory --- keras_preprocessing/image/image_data_generator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/keras_preprocessing/image/image_data_generator.py b/keras_preprocessing/image/image_data_generator.py index 77d88147..f5f1e9c9 100644 --- a/keras_preprocessing/image/image_data_generator.py +++ b/keras_preprocessing/image/image_data_generator.py @@ -441,6 +441,7 @@ def flow(self, def flow_from_directory(self, directory, + balance = False, target_size=(256, 256), color_mode='rgb', classes=None, @@ -465,6 +466,7 @@ def flow_from_directory(self, See [this script]( https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) for more details. + balance: Boolean, handles data imbalance if True target_size: Tuple of integers `(height, width)`, default: `(256, 256)`. The dimensions to which all images found will be resized. From e594e3cbf55fe30bea6f03a6cb44f7f78d41f8da Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Tue, 22 Sep 2020 00:20:29 +0530 Subject: [PATCH 02/11] Introduce 'balance' arg in DirectoryIterator init --- keras_preprocessing/image/directory_iterator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/keras_preprocessing/image/directory_iterator.py b/keras_preprocessing/image/directory_iterator.py index 3a829b4f..36c3eb95 100644 --- a/keras_preprocessing/image/directory_iterator.py +++ b/keras_preprocessing/image/directory_iterator.py @@ -25,6 +25,7 @@ class DirectoryIterator(BatchFromFilesMixin, Iterator): via the `classes` argument. image_data_generator: Instance of `ImageDataGenerator` to use for random transformations and normalization. + balance: Boolean, will handle data imbalance if set to True. target_size: tuple of integers, dimensions to resize input images to. color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. 
Color mode to read images. @@ -76,6 +77,7 @@ def __new__(cls, *args, **kwargs): def __init__(self, directory, image_data_generator, + balance = False, target_size=(256, 256), color_mode='rgb', classes=None, From 8b9110fee1e1c5264a8ce17002ee6bf09cdb425d Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Tue, 22 Sep 2020 00:25:19 +0530 Subject: [PATCH 03/11] Add '_balance_config' attr to DirectoryIterator _balance_config: dict, supposed to store relevant key-value pairs important for further steps to handle data imbalance _balance_config is not generated for validation subset as it does not need resampling (oversampling/handling data imbalance) Yet to implement _make_balance_config --- keras_preprocessing/image/directory_iterator.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/keras_preprocessing/image/directory_iterator.py b/keras_preprocessing/image/directory_iterator.py index 36c3eb95..aa05dd92 100644 --- a/keras_preprocessing/image/directory_iterator.py +++ b/keras_preprocessing/image/directory_iterator.py @@ -11,7 +11,8 @@ import numpy as np from .iterator import BatchFromFilesMixin, Iterator -from .utils import _list_valid_filenames_in_directory +from .utils import (_list_valid_filenames_in_directory, + _make_balance_config) class DirectoryIterator(BatchFromFilesMixin, Iterator): @@ -103,6 +104,13 @@ def __init__(self, subset, interpolation) self.directory = directory + + if balance and subset != 'validation': + self._balance_config = _make_balance_config(directory, + image_data_generator._validation_split) + else: + self._balance_config = None + self.classes = classes if class_mode not in self.allowed_class_modes: raise ValueError('Invalid class_mode: {}; expected one of: {}' From 0be974b0a40357ee35e674e21b3fbbc8277a6f8e Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Tue, 22 Sep 2020 00:47:19 +0530 Subject: [PATCH 04/11] Implement '_make_balance_config' Scans the directory and generates a dict object which stores the 
configurations that will be used for handling data imbalance. Yet to implement '_generate_class_count' --- keras_preprocessing/image/utils.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/keras_preprocessing/image/utils.py b/keras_preprocessing/image/utils.py index bc3e6886..c9fb63e2 100644 --- a/keras_preprocessing/image/utils.py +++ b/keras_preprocessing/image/utils.py @@ -228,6 +228,29 @@ def _list_valid_filenames_in_directory(directory, white_list_formats, split, return classes, filenames +def _make_balance_config(directory, validation_split): + """Scans the directory to make a config dictionary to handle data imbalance. + # Arguments + directory: string, absolute path to the directory + validation_split: float, validation split + Default: None + # Returns + balance_config: dictionary, specs needed to handle data imbalance + 'majority': integer, number of samples in the majority class + """ + class_count = _generate_class_count(directory) + + # Get the sample count of the majority class + majority_class_count = class_count[max(class_count, key = class_count.get)] + if validation_split: + majority_class_count = int(majority_class_count*(1 - validation_split)) + 1 + + balance_config = { + 'majority': majority_class_count + } + + return balance_config + def array_to_img(x, data_format='channels_last', scale=True, dtype='float32'): """Converts a 3D Numpy array to a PIL Image instance. From 12d844d0442f4a79912d7797538098da80e15232 Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Tue, 22 Sep 2020 00:52:40 +0530 Subject: [PATCH 05/11] Implement '_generate_class_count' Scans the directory and generates a dict object which maintains a class count (no. 
of image samples in each class/category) --- keras_preprocessing/image/utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/keras_preprocessing/image/utils.py b/keras_preprocessing/image/utils.py index c9fb63e2..b5871b18 100644 --- a/keras_preprocessing/image/utils.py +++ b/keras_preprocessing/image/utils.py @@ -227,9 +227,28 @@ def _list_valid_filenames_in_directory(directory, white_list_formats, split, return classes, filenames +def _generate_class_count(directory): + """Maintain sample count of each class in the directory. + + # Arguments + directory: string, absolute path to the directory + # Returns + class_count: dictionary, sample count for each class + """ + + class_count = {} + + if directory[-1] != '/': + directory += '/' + + for category in os.listdir(directory): + class_count[category] = len(os.listdir(directory + category)) + + return class_count def _make_balance_config(directory, validation_split): """Scans the directory to make a config dictionary to handle data imbalance. 
+ # Arguments directory: string, absolute path to the directory validation_split: float, validation split From 7cb2c103de381235a05bd0348f8f196c29e067a2 Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Fri, 2 Oct 2020 18:49:26 +0530 Subject: [PATCH 06/11] Refactor '_generate_class_count' --- keras_preprocessing/image/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/keras_preprocessing/image/utils.py b/keras_preprocessing/image/utils.py index b5871b18..aa777a21 100644 --- a/keras_preprocessing/image/utils.py +++ b/keras_preprocessing/image/utils.py @@ -238,11 +238,9 @@ def _generate_class_count(directory): class_count = {} - if directory[-1] != '/': - directory += '/' - for category in os.listdir(directory): - class_count[category] = len(os.listdir(directory + category)) + category_directory = os.path.join(directory, category) + class_count[category] = len(os.listdir(category_directory)) return class_count From a81511fd988a0672e39fc30539b61cf3464647a6 Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Fri, 2 Oct 2020 19:17:50 +0530 Subject: [PATCH 07/11] Add oversampling in list valid files in directory Yet to implement '_settle_debt' --- keras_preprocessing/image/directory_iterator.py | 3 ++- keras_preprocessing/image/utils.py | 13 ++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/keras_preprocessing/image/directory_iterator.py b/keras_preprocessing/image/directory_iterator.py index aa05dd92..8da1dc05 100644 --- a/keras_preprocessing/image/directory_iterator.py +++ b/keras_preprocessing/image/directory_iterator.py @@ -139,7 +139,8 @@ def __init__(self, results.append( pool.apply_async(_list_valid_filenames_in_directory, (dirpath, self.white_list_formats, self.split, - self.class_indices, follow_links))) + self.class_indices, follow_links, + self._balance_config))) classes_list = [] for res in results: classes, filenames = res.get() diff --git a/keras_preprocessing/image/utils.py 
b/keras_preprocessing/image/utils.py index aa777a21..8e9d1038 100644 --- a/keras_preprocessing/image/utils.py +++ b/keras_preprocessing/image/utils.py @@ -183,7 +183,8 @@ def _recursive_list(subpath): def _list_valid_filenames_in_directory(directory, white_list_formats, split, - class_indices, follow_links): + class_indices, follow_links, + balance_config = None): """Lists paths of files in `subdir` with extensions in `white_list_formats`. # Arguments @@ -198,6 +199,7 @@ def _list_valid_filenames_in_directory(directory, white_list_formats, split, of images in each directory. class_indices: dictionary mapping a class name to its index. follow_links: boolean, follow symbolic links to subdirectories. + balance_config: dict, stores configurations for handling data imbalance. # Returns classes: a list of class indices @@ -225,6 +227,15 @@ def _list_valid_filenames_in_directory(directory, white_list_formats, split, dirname, os.path.relpath(absolute_path, directory)) filenames.append(relative_path) + if balance_config: + filenames_copy = filenames.copy() + + debt = balance_config['majority'] - len(filenames_copy) + + for filename in _settle_debt(filenames_copy, debt): + classes.append(class_indices[dirname]) + filenames.append(filename) + return classes, filenames def _generate_class_count(directory): From d9d2993cb6c291cc6dc43473c5a3ae09dedaf298 Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Fri, 2 Oct 2020 20:30:47 +0530 Subject: [PATCH 08/11] Implement '_settle_debt' --- keras_preprocessing/image/utils.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/keras_preprocessing/image/utils.py b/keras_preprocessing/image/utils.py index 8e9d1038..307ac4f0 100644 --- a/keras_preprocessing/image/utils.py +++ b/keras_preprocessing/image/utils.py @@ -7,6 +7,7 @@ import io import os import warnings +import random import numpy as np @@ -181,6 +182,19 @@ def _recursive_list(subpath): if fname.lower().endswith(white_list_formats): yield root, fname +def 
_settle_debt(list_valid_files, debt): + """Iterates over list_valid_files and resamples to settle debt. + + # Arguments: + list_valid_files: List of strings, list that contains valid filenames + debt: Integer, required number of samples to be resampled from + valid_file_names + + # Yields: + randomly chosen filename from list_valid_files + """ + for i in range(debt): + yield random.choice(list_valid_files) def _list_valid_filenames_in_directory(directory, white_list_formats, split, class_indices, follow_links, From 71cabcf53690792a8bc762944bf96729e9d9165d Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Fri, 2 Oct 2020 21:32:06 +0530 Subject: [PATCH 09/11] Set balance arg in flow from directory --- keras_preprocessing/image/image_data_generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_preprocessing/image/image_data_generator.py b/keras_preprocessing/image/image_data_generator.py index f5f1e9c9..20e17fb0 100644 --- a/keras_preprocessing/image/image_data_generator.py +++ b/keras_preprocessing/image/image_data_generator.py @@ -533,6 +533,7 @@ class subdirectories (default: False). 
return DirectoryIterator( directory, self, + balance=balance, target_size=target_size, color_mode=color_mode, classes=classes, From 6e4e9d0dd375d9591f51334d29420657941ed454 Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Fri, 9 Oct 2020 16:59:48 +0530 Subject: [PATCH 10/11] Make changes based on autopep8 --- .../image/directory_iterator.py | 6 ++--- .../image/image_data_generator.py | 2 +- keras_preprocessing/image/utils.py | 23 +++++++++++-------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/keras_preprocessing/image/directory_iterator.py b/keras_preprocessing/image/directory_iterator.py index 8da1dc05..ecb8c654 100644 --- a/keras_preprocessing/image/directory_iterator.py +++ b/keras_preprocessing/image/directory_iterator.py @@ -78,7 +78,7 @@ def __new__(cls, *args, **kwargs): def __init__(self, directory, image_data_generator, - balance = False, + balance=False, target_size=(256, 256), color_mode='rgb', classes=None, @@ -106,8 +106,8 @@ def __init__(self, self.directory = directory if balance and subset != 'validation': - self._balance_config = _make_balance_config(directory, - image_data_generator._validation_split) + self._balance_config = _make_balance_config(directory, + image_data_generator._validation_split) else: self._balance_config = None diff --git a/keras_preprocessing/image/image_data_generator.py b/keras_preprocessing/image/image_data_generator.py index 20e17fb0..a978166e 100644 --- a/keras_preprocessing/image/image_data_generator.py +++ b/keras_preprocessing/image/image_data_generator.py @@ -441,7 +441,7 @@ def flow(self, def flow_from_directory(self, directory, - balance = False, + balance=False, target_size=(256, 256), color_mode='rgb', classes=None, diff --git a/keras_preprocessing/image/utils.py b/keras_preprocessing/image/utils.py index 307ac4f0..da2d3b18 100644 --- a/keras_preprocessing/image/utils.py +++ b/keras_preprocessing/image/utils.py @@ -182,6 +182,7 @@ def _recursive_list(subpath): if 
fname.lower().endswith(white_list_formats): yield root, fname + def _settle_debt(list_valid_files, debt): """Iterates over list_valid_files and resamples to settle debt. @@ -196,9 +197,10 @@ def _settle_debt(list_valid_files, debt): for i in range(debt): yield random.choice(list_valid_files) + def _list_valid_filenames_in_directory(directory, white_list_formats, split, - class_indices, follow_links, - balance_config = None): + class_indices, follow_links, + balance_config=None): """Lists paths of files in `subdir` with extensions in `white_list_formats`. # Arguments @@ -252,26 +254,28 @@ def _list_valid_filenames_in_directory(directory, white_list_formats, split, return classes, filenames + def _generate_class_count(directory): """Maintain sample count of each class in the directory. - + # Arguments directory: string, absolute path to the directory # Returns class_count: dictionary, sample count for each class """ - + class_count = {} for category in os.listdir(directory): - category_directory = os.path.join(directory, category) - class_count[category] = len(os.listdir(category_directory)) + category_directory = os.path.join(directory, category) + class_count[category] = len(os.listdir(category_directory)) return class_count + def _make_balance_config(directory, validation_split): """Scans the directory to make a config dictionary to handle data imbalance. 
- + # Arguments directory: string, absolute path to the directory validation_split: float, validation split @@ -283,9 +287,9 @@ def _make_balance_config(directory, validation_split): class_count = _generate_class_count(directory) # Get the sample count of the majority class - majority_class_count = class_count[max(class_count, key = class_count.get)] + majority_class_count = class_count[max(class_count, key=class_count.get)] if validation_split: - majority_class_count = int(majority_class_count*(1 - validation_split)) + 1 + majority_class_count = int(majority_class_count*(1 - validation_split)) + 1 balance_config = { 'majority': majority_class_count @@ -293,6 +297,7 @@ def _make_balance_config(directory, validation_split): return balance_config + def array_to_img(x, data_format='channels_last', scale=True, dtype='float32'): """Converts a 3D Numpy array to a PIL Image instance. From 80ed78660140fea91da730f00dab04a082dfd43c Mon Sep 17 00:00:00 2001 From: Divyanshu Gupta Date: Fri, 9 Oct 2020 17:15:11 +0530 Subject: [PATCH 11/11] Make changes acc to pep8 conventions --- keras_preprocessing/image/directory_iterator.py | 3 ++- keras_preprocessing/image/utils.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/keras_preprocessing/image/directory_iterator.py b/keras_preprocessing/image/directory_iterator.py index ecb8c654..11fef919 100644 --- a/keras_preprocessing/image/directory_iterator.py +++ b/keras_preprocessing/image/directory_iterator.py @@ -106,8 +106,9 @@ def __init__(self, self.directory = directory if balance and subset != 'validation': + validation_split = image_data_generator._validation_split self._balance_config = _make_balance_config(directory, - image_data_generator._validation_split) + validation_split) else: self._balance_config = None diff --git a/keras_preprocessing/image/utils.py b/keras_preprocessing/image/utils.py index da2d3b18..30f67426 100644 --- a/keras_preprocessing/image/utils.py +++ b/keras_preprocessing/image/utils.py @@ 
-192,7 +192,7 @@ def _settle_debt(list_valid_files, debt): valid_file_names # Yields: - randomly chosen filename from list_valid_files + randomly chosen filename from list_valid_files """ for i in range(debt): yield random.choice(list_valid_files)