Merge branch 'master' into features/pipy_fix

activeloopai · Dec 16, 2020 · 831fb70
2 parents 303e94e + 775f8a5
commit 831fb70
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 20 deletions.
diff --git a/hub/api/datasetview.py b/hub/api/datasetview.py
@@ -22,7 +22,7 @@ def __init__(
             The number of samples in this DatasetView
         offset: int
             The offset from which the DatasetView starts
-        squuze_dim: bool
+        squeeze_dim: bool
             For slicing with integers we would love to remove the first dimension to make it nicer
         """
         if dataset is None:

diff --git a/hub/compute/transform.py b/hub/compute/transform.py
@@ -18,11 +18,10 @@
 from hub.defaults import OBJECT_CHUNK
 
 
-def get_sample_size_in_memory(schema):
-    """Given Schema, looks into memory how many samples can fit and returns it"""
+def get_sample_size(schema, workers):
+    """Given Schema, decides how many samples to take at once and returns it"""
     schema = featurify(schema)
-    mem = virtual_memory()
-    sample_size = 0
+    samples = 10000
     for feature in schema._flatten():
         shp = list(feature.max_shape)
         if len(shp) == 0:
@@ -38,11 +37,8 @@ def prod(shp):
                 res *= s
             return res
 
-        sample_size += prod(shp) * sz
-
-    if sample_size > mem.available:
-        return 1
-    return int((mem.available // sample_size) * 0.8)
+        samples = min(samples, (16 * 1024 * 1024 * 8) // (prod(shp) * sz))
+    return samples * workers
 
 
 class Transform:
@@ -335,12 +331,9 @@ def store(
 
         # compute shard length
         if sample_per_shard is None:
-            n_samples = get_sample_size_in_memory(self.schema)
-            n_samples = min(10000, n_samples)
-            n_samples = max(512, n_samples)
+            n_samples = get_sample_size(self.schema, self.workers)
         else:
             n_samples = sample_per_shard
-
         try:
             length = len(ds_in) if hasattr(ds_in, "__len__") else n_samples
         except Exception:
@@ -372,11 +365,10 @@ def batchify_generator(iterator: Iterable, size: int):
             for ds_in_shard in batchify_generator(ds_in, n_samples):
                 n_results = self.store_shard(ds_in_shard, ds_out, start, token=token)
                 total += n_results
-
+                pbar.update(n_results)
                 if n_results < n_samples or n_results == 0:
                     break
                 start += n_samples
-                pbar.update(n_samples)
 
         ds_out.resize_shape(total)
         ds_out.commit()

diff --git a/hub/schema/segmentation.py b/hub/schema/segmentation.py
@@ -63,10 +63,14 @@ def get_attr_dict(self):
     def __str__(self):
         out = super().__str__()
         out = "Segmentation" + out[6:-1]
-        out = out + ", names=" + self.names if self.names is not None else out
         out = (
-            out + ", num_classes=" + self.num_classes
-            if self.num_classes is not None
+            out + ", names=" + str(self.class_labels._names)
+            if self.class_labels._names is not None
+            else out
+        )
+        out = (
+            out + ", num_classes=" + str(self.class_labels._num_classes)
+            if self.class_labels._num_classes is not None
             else out
         )
         out += ")"

diff --git a/hub/schema/tests/test_features.py b/hub/schema/tests/test_features.py
@@ -1,3 +1,4 @@
+from hub.schema import Segmentation
 from hub.schema.class_label import ClassLabel, _load_names_from_file
 from hub.schema.features import HubSchema, SchemaDict
 import pytest
@@ -49,9 +50,33 @@ def test_feature_dict_repr():
     assert expected_output == feature_dict_object.__repr__()
 
 
+def test_segmentation_repr():
+    seg1 = Segmentation(shape=(3008, 3008), dtype="uint8", num_classes=5)
+    seg2 = Segmentation(
+        shape=(3008, 3008), dtype="uint8", names=["apple", "orange", "banana"]
+    )
+
+    text1 = "Segmentation(shape=(3008, 3008), dtype='uint8', num_classes=5)"
+    text2 = "Segmentation(shape=(3008, 3008), dtype='uint8', names=['apple', 'orange', 'banana'], num_classes=3)"
+    assert seg1.__repr__() == text1
+    assert seg2.__repr__() == text2
+
+
+def test_classlabel_repr():
+    cl1 = ClassLabel(num_classes=5)
+    cl2 = ClassLabel(names=["apple", "orange", "banana"])
+
+    text1 = "ClassLabel(shape=(), dtype='int64', num_classes=5)"
+    text2 = "ClassLabel(shape=(), dtype='int64', names=['apple', 'orange', 'banana'], num_classes=3)"
+    assert cl1.__repr__() == text1
+    assert cl2.__repr__() == text2
+
+
 if __name__ == "__main__":
     test_load_names_from_file()
     test_class_label()
     test_hub_feature_flatten()
     test_feature_dict_str()
     test_feature_dict_repr()
+    test_classlabel_repr()
+    test_segmentation_repr()
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 from setuptools import find_packages, setup
 
 project = "hub"
-VERSION = "1.0.4"
+VERSION = "1.0.6"
 
 this_directory = os.path.abspath(os.path.dirname(__file__))
 with open(os.path.join(this_directory, "README.md")) as f: