mlcommons · ccl-core · Feb 7, 2024 · Feb 6, 2024 · Feb 6, 2024 · Feb 6, 2024
@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -25,6 +25,7 @@
     "format": "cr:format",
     "includes": "cr:includes",
     "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "cr:jsonPath",
     "key": "cr:key",
     "md5": "cr:md5",

@@ -18,6 +18,7 @@
     "format": "ml:format",
     "includes": "ml:includes",
     "isEnumeration": "ml:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "ml:jsonPath",
     "ml": "http://mlcommons.org/schema/",
     "parentField": "ml:parentField",

@@ -25,6 +25,7 @@
     "format": "ml:format",
     "includes": "ml:includes",
     "isEnumeration": "ml:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
     "jsonPath": "ml:jsonPath",
     "key": "sc:key",
     "md5": "sc:md5",

@@ -39,6 +39,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
 ML_COMMONS_FORMAT = lambda ctx: ML_COMMONS(ctx)["format"]
 ML_COMMONS_INCLUDES = lambda ctx: ML_COMMONS(ctx).includes
 ML_COMMONS_IS_ENUMERATION = lambda ctx: ML_COMMONS(ctx).isEnumeration
+ML_COMMONS_IS_LIVE_DATASET = lambda ctx: ML_COMMONS(ctx).isLiveDataset
 ML_COMMONS_JSON_PATH = lambda ctx: ML_COMMONS(ctx).jsonPath
 ML_COMMONS_PARENT_FIELD = lambda ctx: ML_COMMONS(ctx).parentField
 ML_COMMONS_PATH = lambda ctx: ML_COMMONS(ctx).path
@@ -106,6 +107,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
     ML_COMMONS_FILE_PROPERTY(ctx): "file_property",
     ML_COMMONS_FORMAT(ctx): "format",
     ML_COMMONS_INCLUDES(ctx): "includes",
+    ML_COMMONS_IS_LIVE_DATASET(ctx): "is_live_dataset",
     ML_COMMONS_JSON_PATH(ctx): "json_path",
     ML_COMMONS_REFERENCES(ctx): "references",
     ML_COMMONS_REGEX(ctx): "regex",

@@ -74,6 +74,7 @@ class Context:
         default_factory=dict, hash=False
     )
     conforms_to: CroissantVersion = CroissantVersion.V_1_0
+    is_live_dataset: bool | None = None
 
     def __post_init__(self):
         """Standardizes conforms_to."""

@@ -43,6 +43,7 @@ def test_make_context():
         "format": "cr:format",
         "includes": "cr:includes",
         "isEnumeration": "cr:isEnumeration",
+        "isLiveDataset": "cr:isLiveDataset",
         "jsonPath": "cr:jsonPath",
         "key": "cr:key",
         "md5": "cr:md5",

@@ -38,6 +38,7 @@ def make_context(ctx=None, **kwargs):
         "format": "cr:format",
         "includes": "cr:includes",
         "isEnumeration": "cr:isEnumeration",
+        "isLiveDataset": "cr:isLiveDataset",
         "jsonPath": "cr:jsonPath",
         "key": "sc:key" if ctx is not None and ctx.is_v0() else "cr:key",
         "md5": "sc:md5" if ctx is not None and ctx.is_v0() else "cr:md5",

@@ -63,6 +63,7 @@ def __post_init__(self):
             self.metadata = Metadata.from_file(ctx=ctx, file=self.jsonld)
         else:
             return
+        ctx.is_live_dataset = self.metadata.is_live_dataset
         # Draw the structure graph for debugging purposes.
         if self.debug:
             graphs_utils.pretty_print_graph(ctx.graph, simplify=True)

@@ -14,7 +14,6 @@
 
 from mlcroissant._src.core import constants
 from mlcroissant._src.core.constants import EncodingFormat
-from mlcroissant._src.core.context import CroissantVersion
 from mlcroissant._src.core.optional import deps
 from mlcroissant._src.core.path import get_fullpath
 from mlcroissant._src.core.path import Path
@@ -168,9 +167,17 @@ def _check_hash(self, filepath: epath.Path):
         logging.info(
             "Hash of downloaded file is not identical with reference in metadata.json"
         )
-        # In v0.8 only, hashes were not checked.
+
         ctx = self.node.ctx
-        if ctx.conforms_to and ctx.conforms_to > CroissantVersion.V_0_8:
+        # For live datasets, a failed hash check only logs a warning instead of
+        # raising an error.
+        if ctx.is_live_dataset:
+            logging.warning(
+                "Hash of downloaded file not identical with reference in metadata.json!"
+            )
+            return
+        # In v0.8 only, hashes were not checked.
+        if not ctx.is_v0():
             raise ValueError(
                 f"Hash of downloaded file {filepath} is not identical with the"
                 f" reference in the Croissant JSON-LD. Expected: {expected_hash} -"

@@ -1,6 +1,7 @@
 """download_test module."""
 
 import hashlib
+import logging
 import os
 import tempfile
 
@@ -151,14 +152,38 @@ def test_sha256_hashes_do_match(conforms_to, hash_value):
         file_object = create_test_file_object(
             name="foo",
             content_url=os.fspath(filepath),
-            # Hash will match!
+            # Hash matches the reference, so no error is raised.
             sha256=hash_value,
         )
         file_object.parents = [metadata]
         download = Download(operations=operations(), node=file_object)
         download()
 
 
+def test_hashes_are_not_checked_for_live_datasets(caplog):
+    logging.captureWarnings(True)
+    with tempfile.NamedTemporaryFile(delete=False) as f:
+        filepath = f.name
+        ctx = Context(
+            conforms_to=CroissantVersion.V_1_0,
+            folder=epath.Path(),
+            is_live_dataset=True,
+        )
+        metadata = Metadata(ctx=ctx, name="bar")
+        file_object = create_test_file_object(
+            ctx=ctx,
+            name="foo",
+            content_url=os.fspath(filepath),
+            # Hash won't match, but no error raised!
+            sha256="12345",
+        )
+        file_object.parents = [metadata]
+        download = Download(operations=operations(), node=file_object)
+        # A warning is logged, but no error is raised.
+        download()
+        assert "Hash of downloaded file not identical" in caplog.text
+
+
 @pytest.mark.parametrize("conforms_to", CroissantVersion)
 # Test the hex and base64 hash values
 @pytest.mark.parametrize(

@@ -33,7 +33,8 @@ def __post_init__(self):
         self.assert_has_mandatory_properties("encoding_format", "name")
         if not self.contained_in:
             self.assert_has_mandatory_properties("content_url")
-            self.assert_has_exclusive_properties(["md5", "sha256"])
+            if self.ctx and not self.ctx.is_live_dataset:
+                self.assert_has_exclusive_properties(["md5", "sha256"])
 
     def to_json(self) -> Json:
         """Converts the `FileObject` to JSON."""

@@ -30,6 +30,25 @@ def test_checks_are_performed():
         validate_name_mock.assert_called_once()
 
 
+def test_checks_not_performed_for_live_dataset():
+    with mock.patch.object(
+        Node, "assert_has_mandatory_properties"
+    ) as mandatory_mock, mock.patch.object(
+        Node, "assert_has_optional_properties"
+    ), mock.patch.object(
+        Node, "validate_name"
+    ) as validate_name_mock, mock.patch.object(
+        Node, "assert_has_exclusive_properties"
+    ) as exclusive_mock:
+        ctx = Context(is_live_dataset=True)
+        create_test_node(FileObject, ctx=ctx)
+        mandatory_mock.assert_has_calls([
+            mock.call("encoding_format", "name"), mock.call("content_url")
+        ])
+        exclusive_mock.assert_not_called()
+        validate_name_mock.assert_called_once()
+
+
 @pytest.mark.parametrize(
     ["encoding"],
     [