Skip to content

Commit

Permalink
Merge branch 'master' into features/pipy_fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Edward Grigoryan committed Dec 16, 2020
2 parents 303e94e + 775f8a5 commit 831fb70
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 20 deletions.
2 changes: 1 addition & 1 deletion hub/api/datasetview.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(
The number of samples in this DatasetView
offset: int
The offset from which the DatasetView starts
squuze_dim: bool
squeeze_dim: bool
For slicing with integers we would love to remove the first dimension to make it nicer
"""
if dataset is None:
Expand Down
22 changes: 7 additions & 15 deletions hub/compute/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,10 @@
from hub.defaults import OBJECT_CHUNK


def get_sample_size_in_memory(schema):
"""Given Schema, looks into memory how many samples can fit and returns it"""
def get_sample_size(schema, workers):
"""Given Schema, decides how many samples to take at once and returns it"""
schema = featurify(schema)
mem = virtual_memory()
sample_size = 0
samples = 10000
for feature in schema._flatten():
shp = list(feature.max_shape)
if len(shp) == 0:
Expand All @@ -38,11 +37,8 @@ def prod(shp):
res *= s
return res

sample_size += prod(shp) * sz

if sample_size > mem.available:
return 1
return int((mem.available // sample_size) * 0.8)
samples = min(samples, (16 * 1024 * 1024 * 8) // (prod(shp) * sz))
return samples * workers


class Transform:
Expand Down Expand Up @@ -335,12 +331,9 @@ def store(

# compute shard length
if sample_per_shard is None:
n_samples = get_sample_size_in_memory(self.schema)
n_samples = min(10000, n_samples)
n_samples = max(512, n_samples)
n_samples = get_sample_size(self.schema, self.workers)
else:
n_samples = sample_per_shard

try:
length = len(ds_in) if hasattr(ds_in, "__len__") else n_samples
except Exception:
Expand Down Expand Up @@ -372,11 +365,10 @@ def batchify_generator(iterator: Iterable, size: int):
for ds_in_shard in batchify_generator(ds_in, n_samples):
n_results = self.store_shard(ds_in_shard, ds_out, start, token=token)
total += n_results

pbar.update(n_results)
if n_results < n_samples or n_results == 0:
break
start += n_samples
pbar.update(n_samples)

ds_out.resize_shape(total)
ds_out.commit()
Expand Down
10 changes: 7 additions & 3 deletions hub/schema/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,14 @@ def get_attr_dict(self):
def __str__(self):
out = super().__str__()
out = "Segmentation" + out[6:-1]
out = out + ", names=" + self.names if self.names is not None else out
out = (
out + ", num_classes=" + self.num_classes
if self.num_classes is not None
out + ", names=" + str(self.class_labels._names)
if self.class_labels._names is not None
else out
)
out = (
out + ", num_classes=" + str(self.class_labels._num_classes)
if self.class_labels._num_classes is not None
else out
)
out += ")"
Expand Down
25 changes: 25 additions & 0 deletions hub/schema/tests/test_features.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from hub.schema import Segmentation
from hub.schema.class_label import ClassLabel, _load_names_from_file
from hub.schema.features import HubSchema, SchemaDict
import pytest
Expand Down Expand Up @@ -49,9 +50,33 @@ def test_feature_dict_repr():
assert expected_output == feature_dict_object.__repr__()


def test_segmentation_repr():
    """Check the repr of Segmentation built from a class count and from names."""
    # Build both schema objects up front, one per construction style.
    counted = Segmentation(shape=(3008, 3008), dtype="uint8", num_classes=5)
    named = Segmentation(
        shape=(3008, 3008), dtype="uint8", names=["apple", "orange", "banana"]
    )

    expected_counted = (
        "Segmentation(shape=(3008, 3008), dtype='uint8', num_classes=5)"
    )
    expected_named = "Segmentation(shape=(3008, 3008), dtype='uint8', names=['apple', 'orange', 'banana'], num_classes=3)"

    assert counted.__repr__() == expected_counted
    assert named.__repr__() == expected_named


def test_classlabel_repr():
    """Check the repr of ClassLabel built from a class count and from names."""
    # Build both label objects up front, one per construction style.
    counted = ClassLabel(num_classes=5)
    named = ClassLabel(names=["apple", "orange", "banana"])

    expected_counted = "ClassLabel(shape=(), dtype='int64', num_classes=5)"
    expected_named = "ClassLabel(shape=(), dtype='int64', names=['apple', 'orange', 'banana'], num_classes=3)"

    assert counted.__repr__() == expected_counted
    assert named.__repr__() == expected_named


if __name__ == "__main__":
    # Allow running this test module directly: invoke each test in order.
    for run_test in (
        test_load_names_from_file,
        test_class_label,
        test_hub_feature_flatten,
        test_feature_dict_str,
        test_feature_dict_repr,
        test_classlabel_repr,
        test_segmentation_repr,
    ):
        run_test()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from setuptools import find_packages, setup

project = "hub"
VERSION = "1.0.4"
VERSION = "1.0.6"

this_directory = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(this_directory, "README.md")) as f:
Expand Down

0 comments on commit 831fb70

Please sign in to comment.