diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a01a0645a..fbe6ac655 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -196,7 +196,7 @@ jobs: for file in ${JSON_FILES} do echo "Validating ${file}..." - python mlcroissant/scripts/validate.py --file ${file} + python mlcroissant/scripts/validate.py --jsonld ${file} done generation-test: @@ -220,10 +220,10 @@ jobs: run: pip install . - name: Generate JSON-LD files - Titanic - run: mlcroissant load --file ../../datasets/${{ matrix.version }}/titanic/metadata.json --record_set passengers + run: mlcroissant load --jsonld ../../datasets/${{ matrix.version }}/titanic/metadata.json --record_set passengers - name: Generate JSON-LD files - PASS - run: pip install .[image] && python mlcroissant/scripts/load.py --file ../../datasets/${{ matrix.version }}/pass-mini/metadata.json --record_set images + run: pip install .[image] && python mlcroissant/scripts/load.py --jsonld ../../datasets/${{ matrix.version }}/pass-mini/metadata.json --record_set images editor-python-test: name: Editor Python Test diff --git a/python/mlcroissant/README.md b/python/mlcroissant/README.md index d0612aac2..860934ed2 100644 --- a/python/mlcroissant/README.md +++ b/python/mlcroissant/README.md @@ -36,7 +36,7 @@ sudo apt-get install python3-dev graphviz libgraphviz-dev pkg-config ## Verify/load a Croissant dataset ```bash -mlcroissant validate --file ../../datasets/titanic/metadata.json +mlcroissant validate --jsonld ../../datasets/titanic/metadata.json ``` The command: @@ -48,7 +48,7 @@ Similarly, you can generate a dataset by launching: ```bash mlcroissant load \ - --file ../../datasets/titanic/metadata.json \ + --jsonld ../../datasets/titanic/metadata.json \ --record_set passengers \ --num_records 10 ``` @@ -123,7 +123,7 @@ Alternatively, you can contribute to `mlcroissant` using the "classic" GitHub wo You can debug the validation of the file using the `--debug` flag: ```bash -mlcroissant validate --file 
../../datasets/titanic/metadata.json --debug +mlcroissant validate --jsonld ../../datasets/titanic/metadata.json --debug ``` This will: diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py index 6113287fb..6999c9b37 100644 --- a/python/mlcroissant/mlcroissant/_src/datasets_test.py +++ b/python/mlcroissant/mlcroissant/_src/datasets_test.py @@ -55,7 +55,7 @@ def load_records_and_test_equality( ): print( "If this test fails, update JSONL with: `mlcroissant load" - f" --file ../../datasets/{version}/{dataset_name} --record_set" + f" --jsonld ../../datasets/{version}/{dataset_name} --record_set" f" {record_set_name} --num_records {num_records} --debug --update_output`" ) config = ( @@ -85,7 +85,7 @@ def load_records_and_test_equality( # You can regenerate .pkl files by launching # ```bash # mlcroissant load \ -# --file ../../datasets/{{version}}/{{dataset_name}}/metadata.json \ +# --jsonld ../../datasets/{{version}}/{{dataset_name}}/metadata.json \ # --record_set {{record_set_name}} \ # --update_output \ # --num_records -1 diff --git a/python/mlcroissant/mlcroissant/scripts/load.py b/python/mlcroissant/mlcroissant/scripts/load.py index c3a7ad65c..81766f3c4 100644 --- a/python/mlcroissant/mlcroissant/scripts/load.py +++ b/python/mlcroissant/mlcroissant/scripts/load.py @@ -5,6 +5,7 @@ from absl import app from absl import flags +from absl import logging from etils import epath import mlcroissant as mlc @@ -14,9 +15,17 @@ flags.DEFINE_string( - "file", + "jsonld", None, - "Path to the file to validate.", + "JSON-LD to load (path to the file or URL).", + required=False, +) + +flags.DEFINE_string( + "file", + None, + "[DEPRECATED] Use --jsonld instead. Path to the file to load.", + required=False, ) flags.DEFINE_string( @@ -52,7 +61,7 @@ ' "~/Downloads/document.csv"}\'`.', ) -flags.mark_flag_as_required("file") +flags.mark_flags_as_mutual_exclusive(["jsonld", "file"], required=True) FLAGS = flags.FLAGS @@ -61,14 +70,16 @@ def main(argv): """Main function 
launched by the script.""" del argv - file = FLAGS.file + if FLAGS.file: + logging.warning("--file is deprecated. Please, use --jsonld with a path or URL") + jsonld = FLAGS.jsonld or FLAGS.file record_set = FLAGS.record_set num_records = FLAGS.num_records debug = FLAGS.debug update_output = FLAGS.update_output mapping = FLAGS.mapping return load( - file=file, + jsonld=jsonld, record_set=record_set, num_records=num_records, debug=debug, @@ -78,7 +89,7 @@ def main(argv): def load( - file: str, + jsonld: str, record_set: str | None, num_records: int = _NUM_MAX_RECORDS, debug: bool = False, @@ -93,16 +104,16 @@ def load( file_mapping = json.loads(mapping) except json.JSONDecodeError as e: raise ValueError("--mapping should be a valid dict[str, str]") from e - dataset = mlc.Dataset(file, debug=debug, mapping=file_mapping) + dataset = mlc.Dataset(jsonld, debug=debug, mapping=file_mapping) if record_set is None: record_sets = ", ".join([f"`{rs.name}`" for rs in dataset.metadata.record_sets]) raise ValueError(f"--record_set flag should have a value in {record_sets}") records = dataset.records(record_set) generate_all_records = num_records == -1 if generate_all_records: - print(f"Generating all records from {file}.") + print(f"Generating all records from {jsonld}.") else: - print(f"Generating the first {num_records} records from {file}.") + print(f"Generating the first {num_records} records from {jsonld}.") output_records = [] for i, record in enumerate(records): if not generate_all_records and i >= num_records: @@ -110,8 +121,8 @@ def load( print(record) output_records.append(record_to_python(record)) print("Done.") - if update_output: - output_folder = epath.Path(file).parent / "output" + if update_output and not jsonld.startswith("http"): + output_folder = epath.Path(jsonld).parent / "output" if not output_folder.exists(): output_folder.mkdir() output_file = output_folder / f"{record_set}.jsonl" diff --git a/python/mlcroissant/mlcroissant/scripts/load_test.py 
b/python/mlcroissant/mlcroissant/scripts/load_test.py index ecc9c1ec9..3f0b07f35 100644 --- a/python/mlcroissant/mlcroissant/scripts/load_test.py +++ b/python/mlcroissant/mlcroissant/scripts/load_test.py @@ -20,7 +20,7 @@ def test_should_raise_when_no_record_set(version): with pytest.raises( ValueError, match="--record_set flag should have a value in `default`" ): - load_lib.load(file=file, record_set=None) + load_lib.load(jsonld=file, record_set=None) @parametrize_version() @@ -35,4 +35,4 @@ def test_should_raise_when_invalid_mapping(version): with pytest.raises( ValueError, match="--mapping should be a valid dict\\[str, str\\]" ): - load_lib.load(file=file, record_set="default", mapping="foobarbaz") + load_lib.load(jsonld=file, record_set="default", mapping="foobarbaz") diff --git a/python/mlcroissant/mlcroissant/scripts/validate.py b/python/mlcroissant/mlcroissant/scripts/validate.py index 130e9cdff..21e227cb2 100644 --- a/python/mlcroissant/mlcroissant/scripts/validate.py +++ b/python/mlcroissant/mlcroissant/scripts/validate.py @@ -3,7 +3,7 @@ Usage: ``` -mlcroissant validate --file /path/to/file.json +mlcroissant validate --jsonld /path/to/file.json ``` """ @@ -16,9 +16,17 @@ import mlcroissant as mlc flags.DEFINE_string( - "file", + "jsonld", None, - "Path to the file to validate.", + "JSON-LD to validate (path to the file or URL).", + required=False, +) + +flags.DEFINE_string( + "file", + None, + "[DEPRECATED] Use --jsonld instead. Path to the file to validate.", + required=False, ) flags.DEFINE_bool( @@ -27,7 +35,7 @@ "Whether to print debug hints.", ) -flags.mark_flag_as_required("file") +flags.mark_flags_as_mutual_exclusive(["jsonld", "file"], required=True) FLAGS = flags.FLAGS @@ -36,10 +44,12 @@ def main(argv): """Main function launched by the script.""" del argv - file = FLAGS.file + if FLAGS.file: + logging.warning("--file is deprecated. 
Please, use --jsonld with a path or URL") + jsonld = FLAGS.jsonld or FLAGS.file debug = FLAGS.debug try: - mlc.Dataset(file, debug=debug) + mlc.Dataset(jsonld, debug=debug) logging.info("Done.") except mlc.ValidationError as exception: logging.error(exception)