Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add isLiveDataset field #494

Merged
merged 16 commits into from
Feb 7, 2024
1 change: 1 addition & 0 deletions datasets/1.0/bigcode-the-stack/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/coco2014-mini/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/coco2014/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/fashion-mnist/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/flores-200/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/gpt-3/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/huggingface-c4/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/huggingface-mnist/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/movielens/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/pass-mini/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/pass/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/compressed_archive.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/enum.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/file_object_in_zip.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/minimal.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/minimal_recommended.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/read_binary_file_by_line.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/read_from_directory.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/read_from_tar.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/recipes/simple-split.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/simple-dataset/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/simple-join/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/simple-parquet/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/titanic/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/wiki-text/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions datasets/1.0/world-happiness/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions editor/cypress/fixtures/coco.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"format": "ml:format",
"includes": "ml:includes",
"isEnumeration": "ml:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "ml:jsonPath",
"ml": "http://mlcommons.org/schema/",
"parentField": "ml:parentField",
Expand Down
1 change: 1 addition & 0 deletions editor/cypress/fixtures/titanic.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"format": "ml:format",
"includes": "ml:includes",
"isEnumeration": "ml:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "ml:jsonPath",
"key": "sc:key",
"md5": "sc:md5",
Expand Down
2 changes: 2 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
ML_COMMONS_FORMAT = lambda ctx: ML_COMMONS(ctx)["format"]
ML_COMMONS_INCLUDES = lambda ctx: ML_COMMONS(ctx).includes
ML_COMMONS_IS_ENUMERATION = lambda ctx: ML_COMMONS(ctx).isEnumeration
ML_COMMONS_IS_LIVE_DATASET = lambda ctx: ML_COMMONS(ctx).isLiveDataset
ML_COMMONS_JSON_PATH = lambda ctx: ML_COMMONS(ctx).jsonPath
ML_COMMONS_PARENT_FIELD = lambda ctx: ML_COMMONS(ctx).parentField
ML_COMMONS_PATH = lambda ctx: ML_COMMONS(ctx).path
Expand Down Expand Up @@ -106,6 +107,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
ML_COMMONS_FILE_PROPERTY(ctx): "file_property",
ML_COMMONS_FORMAT(ctx): "format",
ML_COMMONS_INCLUDES(ctx): "includes",
ML_COMMONS_IS_LIVE_DATASET(ctx): "is_live_dataset",
ML_COMMONS_JSON_PATH(ctx): "json_path",
ML_COMMONS_REFERENCES(ctx): "references",
ML_COMMONS_REGEX(ctx): "regex",
Expand Down
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/core/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class Context:
default_factory=dict, hash=False
)
conforms_to: CroissantVersion = CroissantVersion.V_1_0
is_live_dataset: bool | None = None

def __post_init__(self):
"""Standardizes conforms_to."""
Expand Down
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/core/json_ld_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def test_make_context():
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/core/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def make_context(ctx=None, **kwargs):
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "sc:key" if ctx is not None and ctx.is_v0() else "cr:key",
"md5": "sc:md5" if ctx is not None and ctx.is_v0() else "cr:md5",
Expand Down
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __post_init__(self):
self.metadata = Metadata.from_file(ctx=ctx, file=self.jsonld)
else:
return
ctx.is_live_dataset = self.metadata.is_live_dataset
# Draw the structure graph for debugging purposes.
if self.debug:
graphs_utils.pretty_print_graph(ctx.graph, simplify=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

from mlcroissant._src.core import constants
from mlcroissant._src.core.constants import EncodingFormat
from mlcroissant._src.core.context import CroissantVersion
from mlcroissant._src.core.optional import deps
from mlcroissant._src.core.path import get_fullpath
from mlcroissant._src.core.path import Path
Expand Down Expand Up @@ -168,9 +167,17 @@ def _check_hash(self, filepath: epath.Path):
logging.info(
"Hash of downloaded file is not identical with reference in metadata.json"
)
# In v0.8 only, hashes were not checked.

ctx = self.node.ctx
if ctx.conforms_to and ctx.conforms_to > CroissantVersion.V_0_8:
# For live datasets, a failed hash check does not raise an error; it only
# logs a warning.
if ctx.is_live_dataset:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a unit test in download_test.py? Something like:

@pytest.mark.parametrize("conforms_to", CroissantVersion)
# Test the hex and base64 hash values
def test_hashes_are_not_checked_for_live_datasets(conforms_to):
    with tempfile.NamedTemporaryFile(delete=False) as f:
        filepath = f.name
        ctx = Context(conforms_to=conforms_to, is_live_dataset=True)
        metadata = Metadata(ctx=ctx, name="bar")
        file_object = create_test_file_object(
            name="foo",
            content_url=os.fspath(filepath),
        )
        file_object.parents = [metadata]
        download = Download(operations=operations(), node=file_object)
        download()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great point, done.

logging.warning(
"Hash of downloaded file not identical with reference in metadata.json!"
)
return
# In v0.8 only, hashes were not checked.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of the comment, use `not ctx.is_v0()`? (This line probably predates the creation of that function.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I just moved the pre-existing line further down. I think `not ctx.is_v0()` would do the same.

if not ctx.is_v0():
raise ValueError(
f"Hash of downloaded file {filepath} is not identical with the"
f" reference in the Croissant JSON-LD. Expected: {expected_hash} -"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""download_test module."""

import hashlib
import logging
import os
import tempfile

Expand Down Expand Up @@ -151,14 +152,38 @@ def test_sha256_hashes_do_match(conforms_to, hash_value):
file_object = create_test_file_object(
name="foo",
content_url=os.fspath(filepath),
# Hash will match!
# Hash won't match!
sha256=hash_value,
)
file_object.parents = [metadata]
download = Download(operations=operations(), node=file_object)
download()


def test_hashes_are_not_checked_for_live_datasets(caplog):
    """Checks that a hash mismatch on a live dataset only logs a warning.

    The file object declares a sha256 ("12345") that is not expected to match the
    empty temporary file. With `is_live_dataset=True` on the context, `download()`
    must complete without raising and the mismatch must show up in the captured
    logs.
    """
    with tempfile.NamedTemporaryFile(delete=False) as f:
        filepath = f.name
        ctx = Context(
            conforms_to=CroissantVersion.V_1_0,
            folder=epath.Path(),
            is_live_dataset=True,
        )
        metadata = Metadata(ctx=ctx, name="bar")
        file_object = create_test_file_object(
            ctx=ctx,
            name="foo",
            content_url=os.fspath(filepath),
            # Hash won't match, but no error raised!
            sha256="12345",
        )
        file_object.parents = [metadata]
        download = Download(operations=operations(), node=file_object)
        # `caplog` captures `logging` records directly. The previous
        # `logging.captureWarnings(True)` call only reroutes the `warnings` module
        # and stayed enabled process-wide for every later test, so it is dropped.
        with caplog.at_level(logging.WARNING):
            # A warning is logged, but no error is raised.
            download()
    assert "Hash of downloaded file not identical" in caplog.text


@pytest.mark.parametrize("conforms_to", CroissantVersion)
# Test the hex and base64 hash values
@pytest.mark.parametrize(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def __post_init__(self):
self.assert_has_mandatory_properties("encoding_format", "name")
if not self.contained_in:
self.assert_has_mandatory_properties("content_url")
self.assert_has_exclusive_properties(["md5", "sha256"])
if self.ctx and not self.ctx.is_live_dataset:
self.assert_has_exclusive_properties(["md5", "sha256"])

def to_json(self) -> Json:
"""Converts the `FileObject` to JSON."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,25 @@ def test_checks_are_performed():
validate_name_mock.assert_called_once()


def test_checks_not_performed_for_live_dataset():
    """Checks that a live-dataset `FileObject` skips the exclusive-property check.

    The `Node` validation helpers are mocked so that only the calls made while
    building the node are observed: the mandatory-property checks and the name
    validation must still happen, while `assert_has_exclusive_properties` must not
    be called at all.
    """

    def patch_node(method_name):
        # Swaps `Node.<method_name>` for a mock while the `with` block is active.
        return mock.patch.object(Node, method_name)

    with patch_node("assert_has_mandatory_properties") as mandatory_mock, \
            patch_node("assert_has_optional_properties"), \
            patch_node("validate_name") as validate_name_mock, \
            patch_node("assert_has_exclusive_properties") as exclusive_mock:
        live_ctx = Context(is_live_dataset=True)
        create_test_node(FileObject, ctx=live_ctx)
        expected_mandatory_calls = [
            mock.call("encoding_format", "name"),
            mock.call("content_url"),
        ]
        mandatory_mock.assert_has_calls(expected_mandatory_calls)
        exclusive_mock.assert_not_called()
        validate_name_mock.assert_called_once()


@pytest.mark.parametrize(
["encoding"],
[
Expand Down
Loading