Merge branch 'master' into LabelsUpdate

activeloopai · Dec 30, 2020 · 32ae235 · 32ae235
2 parents 5c7ccde + 6f34b14
commit 32ae235
Show file tree

Hide file tree

Showing 43 changed files with 3,342 additions and 497 deletions.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -3,7 +3,7 @@ name: Feature request
 about: Suggest an idea for this project
 title: "[FEATURE]"
 labels: "i: enhancement, i: needs triage"
-assignees: ""
+assignees: ''
 
 ---
 

diff --git a/.gitignore b/.gitignore
@@ -185,4 +185,8 @@ logs/
 .creds/
 .idea/
 waymo/
-output/
+output/
+cov.xml
+hub/api/cov.xml
+hub/api/nested_seq
+nested_seq
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
  <p align="center">
     <br>
-    <img src="https://raw.githubusercontent.com/activeloopai/Hub/master/docs/logo/hub_logo_explainer.png" width="50%"/>
+    <img src="https://raw.githubusercontent.com/activeloopai/Hub/master/docs/logo/logo-explainer-bg.png" width="50%"/>
     </br>
 </p>
 
@@ -164,9 +164,17 @@ As always, thanks to our amazing contributors!     </br>
 
 
 ## Examples
-Activeloop’s Hub format lets you achieve faster inference at a lower cost. Test out the datasets we’ve converted into Hub format - see for yourself!
-- [Waymo Open Dataset](https://medium.com/snarkhub/extending-snark-hub-capabilities-to-handle-waymo-open-dataset-4dc7b7d8ab35)
-- [Aptiv nuScenes](https://medium.com/snarkhub/snark-hub-is-hosting-nuscenes-dataset-for-autonomous-driving-1470ae3e1923)
+Activeloop’s Hub format lets you achieve faster inference at a lower cost. We have 30+ popular datasets already on our platform. These include:
+- COCO
+- CIFAR-10
+- PASCAL VOC
+- Cars196
+- KITTI
+- EuroSAT 
+- Caltech-UCSD Birds 200
+- Food101
+
+Check out these and many more popular datasets in our [visualizer web app](https://app.activeloop.ai/datasets/popular) and load them directly for model training!
 
 
 ## Disclaimers

diff --git a/benchmarks/benchmark_to_pytorch.py b/benchmarks/benchmark_to_pytorch.py
@@ -0,0 +1,53 @@
+import torchvision
+import torch
+import numpy as np
+
+import hub
+from hub.utils import Timer
+
+
+class HubAdapter2(torch.utils.data.Dataset):
+    """Expose an indexable dataset of ``(x, y)`` pairs as dict samples.
+
+    Every item of the wrapped dataset is unpacked into two values and
+    returned as ``{"image": np.array(x), "label": y}``.
+    """
+
+    def __init__(self, ods):
+        # Only a reference to the wrapped dataset is kept; nothing is copied.
+        self.ds = ods
+
+    def __len__(self):
+        """Return the wrapped dataset's length, capped at one million."""
+        cap = 1000 * 1000
+        return min(len(self.ds), cap)
+
+    @property
+    def shape(self):
+        """Return ``(n, None, None, None)``; ``n`` is the uncapped length."""
+        return (len(self.ds), None, None, None)
+
+    def __iter__(self):
+        """Yield the samples in index order, stopping at ``len(self)``."""
+        yield from (self[idx] for idx in range(len(self)))
+
+    def __getitem__(self, index):
+        """Return the sample at ``index`` as an image/label dictionary."""
+        image, label = self.ds[index]
+        return {"image": np.array(image), "label": label}
+
+
+def test():
+    """Benchmark iterating CIFAR-10 through Hub's PyTorch integration.
+
+    Downloads CIFAR-10 with torchvision, converts it to a Hub dataset stored
+    under ``./data/test/cifar/train``, reopens that dataset with ``cache`` and
+    ``storage_cache`` disabled, and times a full pass over a ``DataLoader``
+    built from it. Progress (samples seen so far) is printed per batch.
+    """
+    # Single source of truth for the loader batch size and the progress step.
+    batch_size = 100
+
+    tv_cifar_ds = torchvision.datasets.CIFAR10(".", download=True)
+
+    hub_cifar = HubAdapter2(tv_cifar_ds)
+
+    pt2hb_ds = hub.Dataset.from_pytorch(hub_cifar, scheduler="threaded", workers=8)
+    res_ds = pt2hb_ds.store("./data/test/cifar/train")
+    hub_s3_ds = hub.Dataset(
+        url="./data/test/cifar/train", cache=False, storage_cache=False
+    )
+    print(hub_s3_ds._tensors["/image"].chunks)
+    hub_s3_ds = hub_s3_ds.to_pytorch()
+    dl = torch.utils.data.DataLoader(
+        hub_s3_ds, batch_size=batch_size, num_workers=8
+    )
+    with Timer("Time"):
+        counter = 0
+        for b in dl:
+            with Timer("Batch Time"):
+                # Read both fields so the timing covers images and labels;
+                # the label used to be read from the "image" key by mistake.
+                x, y = b["image"], b["label"]
+                counter += batch_size
+                print(counter)
+
+
+# Run the benchmark only when this file is executed as a script.
+if __name__ == "__main__":
+    test()
diff --git a/docs/logo/logo-explainer-bg.png b/docs/logo/logo-explainer-bg.png
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,4 +1,4 @@
 sphinx==3.1.2
 sphinx_markdown_tables==0.0.15
-recommonmark==0.6.0
+recommonmark==0.7.1
 sphinx_rtd_theme==0.5.0
diff --git a/examples/mpii_data_upload_example.py b/examples/mpii_data_upload_example.py
@@ -0,0 +1,99 @@
+import json
+import time
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+
+import hub
+from hub import Dataset, schema
+from hub.schema import Tensor, Text
+
+# Below we define a schema for our dataset. A schema is a kind of
+# container that specifies the structure, shape, dtype and meta information
+# of our dataset. There are different schema types for different
+# kinds of data such as image, tensor and text. More info in the docs.
+#
+# NOTE: notes inside the dict must be ``#`` comments. A string literal
+# placed directly before a key is concatenated with that key by Python,
+# which silently changes the key.
+mpii_schema = {
+    # 'shape' is None for variable image size; the 'max_shape' argument
+    # gives the maximum possible size of an image.
+    "image": schema.Image(
+        shape=(None, None, 3), max_shape=(1920, 1920, 3), dtype="uint8"
+    ),
+    "isValidation": "float64",
+    "img_paths": Text(shape=(None,), max_shape=(15,)),
+    "img_width": "int32",
+    "img_height": "int32",
+    "objpos": Tensor(max_shape=(100,), dtype="float64"),
+    # 'joint_self' has a nested list structure.
+    "joint_self": Tensor(shape=(None, None), max_shape=(100, 100), dtype="float64"),
+    "scale_provided": "float64",
+    "annolist_index": "int32",
+    "people_index": "int32",
+    "numOtherPeople": "int32",
+}
+
+
+def get_anno(jsonfile):
+    """Load the annotations stored in a JSON file.
+
+    Args:
+        jsonfile: Path to a JSON file whose top-level value is a list of
+            annotation dictionaries.
+
+    Returns:
+        A new list holding the annotations in file order.
+
+    Raises:
+        TypeError: If the top-level JSON value is not a list.
+    """
+    # JSON text is UTF-8 by specification; do not rely on the locale default.
+    with open(jsonfile, encoding="utf-8") as f:
+        instances = json.load(f)
+
+    if not isinstance(instances, list):
+        raise TypeError(
+            "expected a JSON list of annotations, got "
+            + type(instances).__name__
+        )
+    return list(instances)
+
+
+@hub.transform(schema=mpii_schema, workers=8)
+def mpii_transform(annotation):
+    """Convert one MPII annotation into a sample matching ``mpii_schema``.
+
+    ``hub.transform`` applies this function to every sample (instance) of
+    the input and outputs a dataset with the specified schema. More info
+    in the docs.
+
+    The image is loaded from ``img_path + annotation["img_paths"]``.
+    ``img_path`` is a module-level name set in the ``__main__`` section and
+    is prepended verbatim, so it must already end with a path separator.
+
+    Args:
+        annotation: Mapping with the keys read below (``img_paths``,
+            ``isValidation``, ``img_width``, ...).
+
+    Returns:
+        A dict with one entry per key of ``mpii_schema``.
+    """
+    # Close the image file as soon as its pixels are copied into an array
+    # instead of leaving the handle open until garbage collection.
+    with Image.open(img_path + annotation["img_paths"]) as img:
+        image = np.array(img)
+
+    return {
+        "image": image,
+        "isValidation": np.array(annotation["isValidation"]),
+        "img_paths": annotation["img_paths"],
+        "img_width": np.array(annotation["img_width"]),
+        "img_height": np.array(annotation["img_height"]),
+        "objpos": np.array(annotation["objpos"]),
+        "joint_self": np.array(annotation["joint_self"]),
+        "scale_provided": np.array(annotation["scale_provided"]),
+        "annolist_index": np.array(annotation["annolist_index"]),
+        "people_index": np.array(annotation["people_index"]),
+        "numOtherPeople": np.array(annotation["numOtherPeople"]),
+    }
+
+
+if __name__ == "__main__":
+    # Ask for the destination tag and the input locations.
+    tag = input("Enter tag(username/dataset_name):")
+    jsonfile = input("Enter json file path:")
+    # Must remain a module-level name: mpii_transform reads ``img_path``.
+    img_path = input("Enter path to images:")
+
+    records = get_anno(jsonfile)
+
+    # Only the transform and the store are timed, not the JSON loading.
+    started = time.time()
+    stored = mpii_transform(records).store(tag)
+    elapsed = time.time() - started
+    print("Time taken to upload:", elapsed, "sec")
+
+# Dataset uploaded using AWS EC2. Pipeline took 8931.26 sec to
+# finish. Dataset is visible on app and tested working fine.
diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md
@@ -0,0 +1,2 @@
+# A Gentle Introduction to Hub
+A collection of tutorials for [Hub](https://github.com/activeloopai/hub). It starts off by working with [different types](https://docs.activeloop.ai/en/latest/concepts/features.html#available-schemas) of data (e.g., images, audio), and then moves on to more complicated concepts like dynamic tensors.
diff --git a/examples/tutorial/Tutorial 1a - Uploading Images.ipynb b/examples/tutorial/Tutorial 1a - Uploading Images.ipynb
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# A Gentle Introduction to Hub
		A collection of tutorials for [Hub](https://github.com/activeloopai/hub). It starts off by working with [different types](https://docs.activeloop.ai/en/latest/concepts/features.html#available-schemas) of data (e.g., images, audio), and then moves on to more complicated concepts like dynamic tensors.