-
Notifications
You must be signed in to change notification settings - Fork 1
/
newmatic_demo.m
184 lines (147 loc) · 6.97 KB
/
newmatic_demo.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
%% Newmatic
%
% Create new MAT-files optimized for partial reading and writing of large arrays
%% Overview
%
% The purpose of this tool is to provide more and easier control over MAT-file formatting. In
% particular, I had some performance problems with partial IO for large arrays and found the
% solutions suggested by Mathworks to be pretty clunky. The three features that |newmatic|
% includes are:
%
% # Creating new array variables with a specified type
% # Allocating the array size (see <https://www.mathworks.com/help/matlab/import_export/troubleshooting-file-size-increases-unexpectedly-when-growing-an-array.html here>)
% # Defining sane chunk sizes (see "Accelerate Save and Load Operations for Version 7.3 MAT-Files" <https://www.mathworks.com/help/matlab/import_export/mat-file-versions.html here>)
%
% For our test case, we will make save a relatively large 3D array to a MAT-file one "page" at a
% time, and then read it back in the same way. This tasks mimics the task that inspired me to
% write this tool, namely partial reads from a stack of image arrays.
%
% TLDR: *Using newmatic makes partial access roughly the same speed as reading/writing whole
% variables, and does not have a significant impact on file size*. For this specific example,
% setting a sane chunk size yields ~20x speedup.
%
% Note: The fact that read timings are systematically lower than write times is likely an artifact
% of data caching in the underlying HDF5 library (see <https://support.hdfgroup.org/HDF5/doc/H5.user/Caching.html here>).
% Because we write before reading in this test, we likely end up reading from (fast) cache rather
% than (slow) disk. For this reason, the _relative_ times are more important than the absolute
% times.
%
% Here is the test data:
num_row = 2000;
num_col = 1000;
num_img = 50;
images = randi(255, num_row, num_col, num_img, 'uint8');
%% Complete read/write with native MATLAB tools
%
% As a baseline, we will use native MATLAB matfile() to write the data at once
% get a temporary file name
native_complete_file = [tempname, '.mat'];
native_complete_cleanup = onCleanup(@() delete(native_complete_file));
% create a matfile object
native_complete_mat = matfile(native_complete_file, 'Writable', true);
% populate the file at once
tic;
native_complete_mat.images = images;
native_complete_write_time = toc;
fprintf('Native-complete, write: %.3f s\n', native_complete_write_time);
% read the images back in from the file one at a time
tic;
[~] = native_complete_mat.images;
native_complete_read_time = toc;
fprintf('Native-complete, read: %.3f s\n', native_complete_read_time);
% get the file size
native_complete_file_obj = dir(native_complete_file);
native_complete_file_size = native_complete_file_obj.bytes/1024/1024;
%% Partial read/write with native MATLAB tools
%
% Now let's try using native MATLAB matfile() to do read and write the data one image at a time
% (i.e., partial IO). This is the real use case we are interested in.
% get a temporary file name
native_partial_file = [tempname, '.mat'];
native_partial_cleanup = onCleanup(@() delete(native_partial_file));
% create a matfile object
native_partial_mat = matfile(native_partial_file, 'Writable', true);
% allocate the array
% see: https://www.mathworks.com/help/matlab/import_export/troubleshooting-file-size-increases-unexpectedly-when-growing-an-array.html
native_partial_mat.images = uint8.empty(0, 0, 0);
native_partial_mat.images(num_row, num_col, num_img) = uint8(0);
% populate the file one image at a time
tic;
for ii = 1:num_img
native_partial_mat.images(:, :, ii) = images(:, :, ii);
end
native_partial_write_time = toc;
fprintf('Native-partial, write: %.3f s\n', native_partial_write_time);
% read the images back in from the file one at a time
tic;
for ii = 1:num_img
[~] = native_partial_mat.images(:, :, ii);
end
native_partial_read_time = toc;
fprintf('Native-partial, read: %.3f s\n', native_partial_read_time);
% get the file size
native_partial_file_obj = dir(native_partial_file);
native_partial_file_size = native_partial_file_obj.bytes/1024/1024;
%% Partial read/write with newmatic
%
% Now for the good stuff. Let's use |newmatic| to create our file, and then read and write the data
% one image at a time. We will choose a chunk size that neatly matches our planned access pattern
% (i.e, an image).
% get a temporary file name
newmatic_partial_file = [tempname, '.mat'];
newmatic_partial_cleanup = onCleanup(@() delete(newmatic_partial_file));
% create a matfile object with newmatic
var_size = [num_row, num_col, num_img];
var_chunk = [num_row, num_col, 1];
newmatic_partial_mat = newmatic(newmatic_partial_file, newmatic_variable('images', 'uint8', var_size, var_chunk));
% populate the file one image at a time
tic;
for ii = 1:num_img
newmatic_partial_mat.images(:, :, ii) = images(:, :, ii);
end
newmatic_partial_write_time = toc;
fprintf('Newmatic-partial, write: %.3f s\n', newmatic_partial_write_time);
% read the images back in from the file one at a time
tic;
for ii = 1:num_img
[~] = newmatic_partial_mat.images(:, :, ii);
end
newmatic_partial_read_time = toc;
fprintf('Newmatic-partial, read: %.3f s\n', newmatic_partial_read_time);
% get the file size
newmatic_partial_file_obj = dir(newmatic_partial_file);
newmatic_partial_file_size = newmatic_partial_file_obj.bytes/1024/1024;
%% Complete read/write with newmatic
%
% To round out the comparison, let's read/write whole variables using newmatic
% get a temporary file name
newmatic_complete_file = [tempname, '.mat'];
newmatic_complete_cleanup = onCleanup(@() delete(newmatic_complete_file));
% create a matfile object with newmatic
var_size = [num_row, num_col, num_img];
var_chunk = [num_row, num_col, 1];
newmatic_complete_mat = newmatic(newmatic_complete_file, newmatic_variable('images', 'uint8', var_size, var_chunk));
% populate the file at-once
tic;
newmatic_complete_mat.images = images;
newmatic_complete_write_time = toc;
fprintf('Newmatic-complete, write: %.3f s\n', newmatic_complete_write_time);
% read the images back in from the file at-once
tic;
[~] = newmatic_complete_mat.images;
newmatic_complete_read_time = toc;
fprintf('Newmatic-complete, read: %.3f s\n', newmatic_complete_read_time);
% get the file size
newmatic_complete_file_obj = dir(newmatic_complete_file);
newmatic_complete_file_size = newmatic_complete_file_obj.bytes/1024/1024;
%% Comparison
%
% To make the comparison a bit easier, check out the tabulated results below:
results = table(...
round([native_complete_write_time; newmatic_complete_write_time; native_partial_write_time; newmatic_partial_write_time], 2), ...
round([native_complete_read_time; newmatic_complete_read_time; native_partial_read_time; newmatic_partial_read_time], 2), ...
round([native_complete_file_size; newmatic_complete_file_size; native_partial_file_size; newmatic_partial_file_size], 2), ...
'RowNames', {'native-complete', 'newmatic-complete', 'native-partial', 'newmatic-partial'}, ...
'VariableNames', {'write-time-seconds', 'read-time-seconds', 'file-size-MB'}...
);
disp(results);