Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate --file to --jsonld to align with the API. #485

Merged
merged 1 commit into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ jobs:
for file in ${JSON_FILES}
do
echo "Validating ${file}..."
python mlcroissant/scripts/validate.py --file ${file}
python mlcroissant/scripts/validate.py --jsonld ${file}
done

generation-test:
Expand All @@ -220,10 +220,10 @@ jobs:
run: pip install .

- name: Generate JSON-LD files - Titanic
run: mlcroissant load --file ../../datasets/${{ matrix.version }}/titanic/metadata.json --record_set passengers
run: mlcroissant load --jsonld ../../datasets/${{ matrix.version }}/titanic/metadata.json --record_set passengers

- name: Generate JSON-LD files - PASS
run: pip install .[image] && python mlcroissant/scripts/load.py --file ../../datasets/${{ matrix.version }}/pass-mini/metadata.json --record_set images
run: pip install .[image] && python mlcroissant/scripts/load.py --jsonld ../../datasets/${{ matrix.version }}/pass-mini/metadata.json --record_set images

editor-python-test:
name: Editor Python Test
Expand Down
6 changes: 3 additions & 3 deletions python/mlcroissant/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ sudo apt-get install python3-dev graphviz libgraphviz-dev pkg-config
## Verify/load a Croissant dataset

```bash
mlcroissant validate --file ../../datasets/titanic/metadata.json
mlcroissant validate --jsonld ../../datasets/titanic/metadata.json
```

The command:
Expand All @@ -48,7 +48,7 @@ Similarly, you can generate a dataset by launching:

```bash
mlcroissant load \
--file ../../datasets/titanic/metadata.json \
--jsonld ../../datasets/titanic/metadata.json \
--record_set passengers \
--num_records 10
```
Expand Down Expand Up @@ -123,7 +123,7 @@ Alternatively, you can contribute to `mlcroissant` using the "classic" GitHub wo
You can debug the validation of the file using the `--debug` flag:

```bash
mlcroissant validate --file ../../datasets/titanic/metadata.json --debug
mlcroissant validate --jsonld ../../datasets/titanic/metadata.json --debug
```

This will:
Expand Down
4 changes: 2 additions & 2 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def load_records_and_test_equality(
):
print(
"If this test fails, update JSONL with: `mlcroissant load"
f" --file ../../datasets/{version}/{dataset_name} --record_set"
f" --jsonld ../../datasets/{version}/{dataset_name} --record_set"
f" {record_set_name} --num_records {num_records} --debug --update_output`"
)
config = (
Expand Down Expand Up @@ -85,7 +85,7 @@ def load_records_and_test_equality(
# You can regenerate .pkl files by launching
# ```bash
# mlcroissant load \
# --file ../../datasets/{{version}}/{{dataset_name}}/metadata.json \
# --jsonld ../../datasets/{{version}}/{{dataset_name}}/metadata.json \
# --record_set {{record_set_name}} \
# --update_output \
# --num_records -1
Expand Down
33 changes: 22 additions & 11 deletions python/mlcroissant/mlcroissant/scripts/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from absl import app
from absl import flags
from absl import logging
from etils import epath

import mlcroissant as mlc
Expand All @@ -14,9 +15,17 @@


flags.DEFINE_string(
"file",
"jsonld",
None,
"Path to the file to validate.",
"JSON-LD to validate (path to the file or URL).",
required=True,
)

flags.DEFINE_string(
"file",
"",
"[DEPRECATED] Path to the file to validate.",
required=False,
)

flags.DEFINE_string(
Expand Down Expand Up @@ -52,7 +61,7 @@
' "~/Downloads/document.csv"}\'`.',
)

flags.mark_flag_as_required("file")
flags.mark_flag_as_required("jsonld")


FLAGS = flags.FLAGS
Expand All @@ -61,14 +70,16 @@
def main(argv):
"""Main function launched by the script."""
del argv
file = FLAGS.file
if FLAGS.file:
logging.warning("--file is deprecated. Please, use --jsonld with a path or URL")
jsonld = FLAGS.jsonld or FLAGS.file
record_set = FLAGS.record_set
num_records = FLAGS.num_records
debug = FLAGS.debug
update_output = FLAGS.update_output
mapping = FLAGS.mapping
return load(
file=file,
jsonld=jsonld,
record_set=record_set,
num_records=num_records,
debug=debug,
Expand All @@ -78,7 +89,7 @@ def main(argv):


def load(
file: str,
jsonld: str,
record_set: str | None,
num_records: int = _NUM_MAX_RECORDS,
debug: bool = False,
Expand All @@ -93,25 +104,25 @@ def load(
file_mapping = json.loads(mapping)
except json.JSONDecodeError as e:
raise ValueError("--mapping should be a valid dict[str, str]") from e
dataset = mlc.Dataset(file, debug=debug, mapping=file_mapping)
dataset = mlc.Dataset(jsonld, debug=debug, mapping=file_mapping)
if record_set is None:
record_sets = ", ".join([f"`{rs.name}`" for rs in dataset.metadata.record_sets])
raise ValueError(f"--record_set flag should have a value in {record_sets}")
records = dataset.records(record_set)
generate_all_records = num_records == -1
if generate_all_records:
print(f"Generating all records from {file}.")
print(f"Generating all records from {jsonld}.")
else:
print(f"Generating the first {num_records} records from {file}.")
print(f"Generating the first {num_records} records from {jsonld}.")
output_records = []
for i, record in enumerate(records):
if not generate_all_records and i >= num_records:
break
print(record)
output_records.append(record_to_python(record))
print("Done.")
if update_output:
output_folder = epath.Path(file).parent / "output"
if update_output and not jsonld.startswith("http"):
output_folder = epath.Path(jsonld).parent / "output"
if not output_folder.exists():
output_folder.mkdir()
output_file = output_folder / f"{record_set}.jsonl"
Expand Down
4 changes: 2 additions & 2 deletions python/mlcroissant/mlcroissant/scripts/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_should_raise_when_no_record_set(version):
with pytest.raises(
ValueError, match="--record_set flag should have a value in `default`"
):
load_lib.load(file=file, record_set=None)
load_lib.load(jsonld=file, record_set=None)


@parametrize_version()
Expand All @@ -35,4 +35,4 @@ def test_should_raise_when_invalid_mapping(version):
with pytest.raises(
ValueError, match="--mapping should be a valid dict\\[str, str\\]"
):
load_lib.load(file=file, record_set="default", mapping="foobarbaz")
load_lib.load(jsonld=file, record_set="default", mapping="foobarbaz")
22 changes: 16 additions & 6 deletions python/mlcroissant/mlcroissant/scripts/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Usage:

```
mlcroissant validate --file /path/to/file.json
mlcroissant validate --jsonld /path/to/file.json
```
"""

Expand All @@ -16,9 +16,17 @@
import mlcroissant as mlc

flags.DEFINE_string(
"file",
"jsonld",
None,
"Path to the file to validate.",
"JSON-LD to validate (path to the file or URL).",
required=True,
)

flags.DEFINE_string(
"file",
"",
"[DEPRECATED] Path to the file to validate.",
required=False,
)

flags.DEFINE_bool(
Expand All @@ -27,7 +35,7 @@
"Whether to print debug hints.",
)

flags.mark_flag_as_required("file")
flags.mark_flag_as_required("jsonld")


FLAGS = flags.FLAGS
Expand All @@ -36,10 +44,12 @@
def main(argv):
"""Main function launched by the script."""
del argv
file = FLAGS.file
if FLAGS.file:
logging.warning("--file is deprecated. Please, use --jsonld with a path or URL")
jsonld = FLAGS.jsonld or FLAGS.file
debug = FLAGS.debug
try:
mlc.Dataset(file, debug=debug)
mlc.Dataset(jsonld, debug=debug)
logging.info("Done.")
except mlc.ValidationError as exception:
logging.error(exception)
Expand Down