mlcommons · marcenacp · Feb 6, 2024 · Feb 6, 2024
@@ -196,7 +196,7 @@ jobs:
         for file in ${JSON_FILES}
         do
           echo "Validating ${file}..."
-          python mlcroissant/scripts/validate.py --file ${file}
+          python mlcroissant/scripts/validate.py --jsonld ${file}
         done
 
   generation-test:
@@ -220,10 +220,10 @@ jobs:
       run: pip install .
 
     - name: Generate JSON-LD files - Titanic
-      run: mlcroissant load --file ../../datasets/${{ matrix.version }}/titanic/metadata.json --record_set passengers
+      run: mlcroissant load --jsonld ../../datasets/${{ matrix.version }}/titanic/metadata.json --record_set passengers
 
     - name: Generate JSON-LD files - PASS
-      run: pip install .[image] && python mlcroissant/scripts/load.py --file ../../datasets/${{ matrix.version }}/pass-mini/metadata.json --record_set images
+      run: pip install .[image] && python mlcroissant/scripts/load.py --jsonld ../../datasets/${{ matrix.version }}/pass-mini/metadata.json --record_set images
 
   editor-python-test:
     name: Editor Python Test

@@ -36,7 +36,7 @@ sudo apt-get install python3-dev graphviz libgraphviz-dev pkg-config
 ## Verify/load a Croissant dataset
 
 ```bash
-mlcroissant validate --file ../../datasets/titanic/metadata.json
+mlcroissant validate --jsonld ../../datasets/titanic/metadata.json
 ```
 
 The command:
@@ -48,7 +48,7 @@ Similarly, you can generate a dataset by launching:
 
 ```bash
 mlcroissant load \
-    --file ../../datasets/titanic/metadata.json \
+    --jsonld ../../datasets/titanic/metadata.json \
     --record_set passengers \
     --num_records 10
 ```
@@ -123,7 +123,7 @@ Alternatively, you can contribute to `mlcroissant` using the "classic" GitHub wo
 You can debug the validation of the file using the `--debug` flag:
 
 ```bash
-mlcroissant validate --file ../../datasets/titanic/metadata.json --debug
+mlcroissant validate --jsonld ../../datasets/titanic/metadata.json --debug
 ```
 
 This will:

@@ -55,7 +55,7 @@ def load_records_and_test_equality(
 ):
     print(
         "If this test fails, update JSONL with: `mlcroissant load"
-        f" --file ../../datasets/{version}/{dataset_name} --record_set"
+        f" --jsonld ../../datasets/{version}/{dataset_name} --record_set"
         f" {record_set_name} --num_records {num_records} --debug --update_output`"
     )
     config = (
@@ -85,7 +85,7 @@ def load_records_and_test_equality(
 # You can regenerate .pkl files by launching
 # ```bash
 # mlcroissant load \
-#   --file ../../datasets/{{version}}/{{dataset_name}}/metadata.json \
+#   --jsonld ../../datasets/{{version}}/{{dataset_name}}/metadata.json \
 #   --record_set {{record_set_name}} \
 #   --update_output \
 #   --num_records -1

@@ -5,6 +5,7 @@
 
 from absl import app
 from absl import flags
+from absl import logging
 from etils import epath
 
 import mlcroissant as mlc
@@ -14,9 +15,17 @@
 
 
 flags.DEFINE_string(
-    "file",
+    "jsonld",
     None,
-    "Path to the file to validate.",
+    "JSON-LD to load (path to the file or URL).",
+    required=False,  # Exactly one of --jsonld/--file is enforced below.
+)
+
+flags.DEFINE_string(
+    "file",
+    None,  # None (not "") so the exclusivity check treats it as unset.
+    "[DEPRECATED] Path to the file to load.",
+    required=False,
 )
 
 flags.DEFINE_string(
@@ -52,7 +61,7 @@
     ' "~/Downloads/document.csv"}\'`.',
 )
 
-flags.mark_flag_as_required("file")
+flags.mark_flags_as_mutual_exclusive(["jsonld", "file"], required=True)
 
 
 FLAGS = flags.FLAGS
@@ -61,14 +70,16 @@
 def main(argv):
     """Main function launched by the script."""
     del argv
-    file = FLAGS.file
+    if FLAGS.file:
+        logging.warning("--file is deprecated. Please, use --jsonld with a path or URL")
+    jsonld = FLAGS.jsonld or FLAGS.file
     record_set = FLAGS.record_set
     num_records = FLAGS.num_records
     debug = FLAGS.debug
     update_output = FLAGS.update_output
     mapping = FLAGS.mapping
     return load(
-        file=file,
+        jsonld=jsonld,
         record_set=record_set,
         num_records=num_records,
         debug=debug,
@@ -78,7 +89,7 @@ def main(argv):
 
 
 def load(
-    file: str,
+    jsonld: str,
     record_set: str | None,
     num_records: int = _NUM_MAX_RECORDS,
     debug: bool = False,
@@ -93,25 +104,25 @@ def load(
             file_mapping = json.loads(mapping)
         except json.JSONDecodeError as e:
             raise ValueError("--mapping should be a valid dict[str, str]") from e
-    dataset = mlc.Dataset(file, debug=debug, mapping=file_mapping)
+    dataset = mlc.Dataset(jsonld, debug=debug, mapping=file_mapping)
     if record_set is None:
         record_sets = ", ".join([f"`{rs.name}`" for rs in dataset.metadata.record_sets])
         raise ValueError(f"--record_set flag should have a value in {record_sets}")
     records = dataset.records(record_set)
     generate_all_records = num_records == -1
     if generate_all_records:
-        print(f"Generating all records from {file}.")
+        print(f"Generating all records from {jsonld}.")
     else:
-        print(f"Generating the first {num_records} records from {file}.")
+        print(f"Generating the first {num_records} records from {jsonld}.")
     output_records = []
     for i, record in enumerate(records):
         if not generate_all_records and i >= num_records:
             break
         print(record)
         output_records.append(record_to_python(record))
     print("Done.")
-    if update_output:
-        output_folder = epath.Path(file).parent / "output"
+    if update_output and not jsonld.startswith("http"):
+        output_folder = epath.Path(jsonld).parent / "output"
         if not output_folder.exists():
             output_folder.mkdir()
         output_file = output_folder / f"{record_set}.jsonl"

@@ -20,7 +20,7 @@ def test_should_raise_when_no_record_set(version):
     with pytest.raises(
         ValueError, match="--record_set flag should have a value in `default`"
     ):
-        load_lib.load(file=file, record_set=None)
+        load_lib.load(jsonld=file, record_set=None)
 
 
 @parametrize_version()
@@ -35,4 +35,4 @@ def test_should_raise_when_invalid_mapping(version):
     with pytest.raises(
         ValueError, match="--mapping should be a valid dict\\[str, str\\]"
     ):
-        load_lib.load(file=file, record_set="default", mapping="foobarbaz")
+        load_lib.load(jsonld=file, record_set="default", mapping="foobarbaz")
@@ -3,7 +3,7 @@
 Usage:
 
 ```
-mlcroissant validate --file /path/to/file.json
+mlcroissant validate --jsonld /path/to/file.json
 ```
 """
 
@@ -16,9 +16,17 @@
 import mlcroissant as mlc
 
 flags.DEFINE_string(
-    "file",
+    "jsonld",
     None,
-    "Path to the file to validate.",
+    "JSON-LD to validate (path to the file or URL).",
+    required=False,  # Exactly one of --jsonld/--file is enforced below.
+)
+
+flags.DEFINE_string(
+    "file",
+    None,  # None (not "") so the exclusivity check treats it as unset.
+    "[DEPRECATED] Path to the file to validate.",
+    required=False,
 )
 
 flags.DEFINE_bool(
@@ -27,7 +35,7 @@
     "Whether to print debug hints.",
 )
 
-flags.mark_flag_as_required("file")
+flags.mark_flags_as_mutual_exclusive(["jsonld", "file"], required=True)
 
 
 FLAGS = flags.FLAGS
@@ -36,10 +44,12 @@
 def main(argv):
     """Main function launched by the script."""
     del argv
-    file = FLAGS.file
+    if FLAGS.file:
+        logging.warning("--file is deprecated. Please, use --jsonld with a path or URL")
+    jsonld = FLAGS.jsonld or FLAGS.file
     debug = FLAGS.debug
     try:
-        mlc.Dataset(file, debug=debug)
+        mlc.Dataset(jsonld, debug=debug)
         logging.info("Done.")
     except mlc.ValidationError as exception:
         logging.error(exception)