Skip to content

Commit 0911cdf

Browse files
authored
Align the API and add better documentation. (#488)
1 parent 7ce531d commit 0911cdf

File tree

5 files changed

+19
-16
lines changed

5 files changed

+19
-16
lines changed

python/mlcroissant/mlcroissant/_src/datasets.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,15 @@ class Dataset:
3939
"""Python representation of a Croissant dataset.
4040
4141
Args:
42-
file: A JSON object or a path to a Croissant file (string or pathlib.Path).
43-
operations: The operation graph class. None by default.
42+
jsonld: A JSON object or a path to a Croissant file (URL, str or pathlib.Path).
4443
debug: Whether to print debug hints. False by default.
44+
mapping: Mapping filename->filepath as a Python dict[str, str] to handle manual
45+
downloads. If `document.csv` is the FileObject and you downloaded it to
46+
`~/Downloads/document.csv`, you can specify `mapping={"document.csv":
47+
"~/Downloads/document.csv"}`.
4548
"""
4649

47-
file: epath.PathLike | str | dict[str, Any] | None
50+
jsonld: epath.PathLike | str | dict[str, Any] | None
4851
operations: OperationGraph = dataclasses.field(init=False)
4952
metadata: Metadata = dataclasses.field(init=False)
5053
debug: bool = False
@@ -54,10 +57,10 @@ def __post_init__(self):
5457
"""Runs the static analysis of `jsonld`."""
5558
ctx = Context()
5659
ctx.mapping = _expand_mapping(self.mapping)
57-
if isinstance(self.file, dict):
58-
self.metadata = Metadata.from_json(ctx=ctx, json_=self.file)
59-
elif self.file is not None:
60-
self.metadata = Metadata.from_file(ctx=ctx, file=self.file)
60+
if isinstance(self.jsonld, dict):
61+
self.metadata = Metadata.from_json(ctx=ctx, json_=self.jsonld)
62+
elif self.jsonld is not None:
63+
self.metadata = Metadata.from_file(ctx=ctx, file=self.jsonld)
6164
else:
6265
return
6366
# Draw the structure graph for debugging purposes.
@@ -71,7 +74,7 @@ def __post_init__(self):
7174
@classmethod
7275
def from_metadata(cls, metadata: Metadata) -> Dataset:
7376
"""Creates a new `Dataset` from a `Metadata`."""
74-
dataset = Dataset(file=None)
77+
dataset = Dataset(jsonld=None)
7578
dataset.metadata = metadata
7679
dataset.operations = get_operations(metadata.ctx, metadata)
7780
return dataset

python/mlcroissant/mlcroissant/_src/torch/torch_adapter/dataloader.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,9 @@ def apply_data_type_transformation(
5555
class LoaderFactory:
5656
"""Used to create loaders and get metadata."""
5757

58-
def __init__(self, file: str):
58+
def __init__(self, jsonld: str):
5959
"""Initialize LoaderFactory with a Croissant file."""
60-
self.file = file
60+
self.jsonld = jsonld
6161

6262
def _get_row_processor(self, specification: LoaderSpecificationTypes):
6363
"""Remap columns types to desired type."""
@@ -78,7 +78,7 @@ def as_datapipe(
7878
if dp is None:
7979
raise NotImplementedError(INSTALL_MESSAGE)
8080

81-
dataset = Dataset(file=self.file)
81+
dataset = Dataset(jsonld=self.jsonld)
8282
records = dataset.records(record_set=record_set)
8383
datapipe = dp.iter.IterableWrapper(records)
8484
if specification:

python/mlcroissant/recipes/PyTorch_FLORES200.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@
100100
"metadata": {},
101101
"outputs": [],
102102
"source": [
103-
"ta_factory = mlc.torch.LoaderFactory(file=\"../../../datasets/flores-200/metadata.json\")\n",
103+
"ta_factory = mlc.torch.LoaderFactory(jsonld=\"../../../datasets/flores-200/metadata.json\")\n",
104104
"specification = {\n",
105105
" \"translation\": mlc.torch.LoaderSpecificationDataType.INFER,\n",
106106
" \"language\": mlc.torch.LoaderSpecificationDataType.INFER,\n",

python/mlcroissant/recipes/bounding-boxes.ipynb

+3-3
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,8 @@
123123
"metadata": {},
124124
"outputs": [],
125125
"source": [
126-
"file = epath.Path(\"croissant.json\")\n",
127-
"with file.open(\"w\") as f:\n",
126+
"jsonld = epath.Path(\"croissant.json\")\n",
127+
"with jsonld.open(\"w\") as f:\n",
128128
" f.write(json.dumps(metadata.to_json(), indent=2))\n",
129129
"\n",
130130
"!grep -C 7 -n \"cr:BoundingBox\" croissant.json"
@@ -147,7 +147,7 @@
147147
"metadata": {},
148148
"outputs": [],
149149
"source": [
150-
"dataset = mlc.Dataset(file=file)\n",
150+
"dataset = mlc.Dataset(jsonld=jsonld)\n",
151151
"records = dataset.records(record_set=record_set)\n",
152152
"record = next(iter(records))\n",
153153
"print(\"The first record:\")\n",

python/mlcroissant/recipes/introduction.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@
274274
{
275275
"cell_type": "code",
276276
"source": [
277-
"dataset = mlc.Dataset(file=\"croissant.json\")"
277+
"dataset = mlc.Dataset(jsonld=\"croissant.json\")"
278278
],
279279
"metadata": {
280280
"id": "_JNyQFuAEiIs"

0 commit comments

Comments
 (0)