From cda1cb6fe010d06ddfb37d680d0b9979d1ba7273 Mon Sep 17 00:00:00 2001 From: Sergey Motornyuk Date: Sat, 23 Nov 2024 19:47:11 +0200 Subject: [PATCH] chore: add docs --- .gitignore | 1 + ckanext/transmute/logic/action.py | 1 + ckanext/transmute/transmutators.py | 78 +++--- docs/usage.md | 366 ----------------------------- docs/usage/index.md | 142 +++++++++++ docs/usage/schema.md | 38 +++ docs/usage/transmutators.md | 22 ++ docs/usage/type.md | 52 ++++ mkdocs.yml | 6 +- 9 files changed, 298 insertions(+), 408 deletions(-) delete mode 100644 docs/usage.md create mode 100644 docs/usage/index.md create mode 100644 docs/usage/schema.md create mode 100644 docs/usage/transmutators.md create mode 100644 docs/usage/type.md diff --git a/.gitignore b/.gitignore index 8570dc5..b9d3c90 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,4 @@ coverage.xml # Sphinx documentation docs/_build/ +site/ diff --git a/ckanext/transmute/logic/action.py b/ckanext/transmute/logic/action.py index 738d58c..513533b 100644 --- a/ckanext/transmute/logic/action.py +++ b/ckanext/transmute/logic/action.py @@ -142,6 +142,7 @@ def _process_field( else: data[field.name] = value = field.value + if field.is_multiple(): for nested_field in value or []: # type: ignore _transmute_data(nested_field, definition, field.type) diff --git a/ckanext/transmute/transmutators.py b/ckanext/transmute/transmutators.py index dac8a05..ccc8e24 100644 --- a/ckanext/transmute/transmutators.py +++ b/ckanext/transmute/transmutators.py @@ -10,21 +10,29 @@ from ckanext.transmute.types import Field -_transmutators: dict[str, Callable[..., Any]] = {} SENTINEL = object() def get_transmutators(): - return _transmutators - - -def transmutator(func): - _transmutators[f"tsm_{func.__name__}"] = func - return func - - -@transmutator -def name_validator(field: Field) -> Field: + return { + "tsm_name_validator": tsm_name_validator, + "tsm_to_lowercase": tsm_to_lowercase, + "tsm_to_uppercase": tsm_to_uppercase, + "tsm_string_only": 
tsm_string_only, + "tsm_isodate": tsm_isodate, + "tsm_to_string": tsm_to_string, + "tsm_stop_on_empty": tsm_stop_on_empty, + "tsm_get_nested": tsm_get_nested, + "tsm_trim_string": tsm_trim_string, + "tsm_concat": tsm_concat, + "tsm_unique_only": tsm_unique_only, + "tsm_mapper": tsm_mapper, + "tsm_list_mapper": tsm_list_mapper, + "tsm_map_value": tsm_map_value, + } + + +def tsm_name_validator(field: Field) -> Field: """Wrapper over CKAN default `name_validator` validator. Args: @@ -42,8 +50,7 @@ def name_validator(field: Field) -> Field: return field -@transmutator -def to_lowercase(field: Field) -> Field: +def tsm_to_lowercase(field: Field) -> Field: """Casts string value to lowercase. Args: @@ -56,8 +63,7 @@ def to_lowercase(field: Field) -> Field: return field -@transmutator -def to_uppercase(field: Field) -> Field: +def tsm_to_uppercase(field: Field) -> Field: """Casts string value to uppercase. Args: @@ -70,8 +76,7 @@ def to_uppercase(field: Field) -> Field: return field -@transmutator -def string_only(field: Field) -> Field: +def tsm_string_only(field: Field) -> Field: """Validates if field.value is string. Args: @@ -88,8 +93,7 @@ def string_only(field: Field) -> Field: return field -@transmutator -def isodate(field: Field) -> Field: +def tsm_isodate(field: Field) -> Field: """Validates datetime string Mutates an iso-like string to datetime object. @@ -113,8 +117,7 @@ def isodate(field: Field) -> Field: return field -@transmutator -def to_string(field: Field) -> Field: +def tsm_to_string(field: Field) -> Field: """Casts field.value to str. Args: @@ -128,8 +131,7 @@ def to_string(field: Field) -> Field: return field -@transmutator -def stop_on_empty(field: Field) -> Field: +def tsm_stop_on_empty(field: Field) -> Field: """Stop transmutation if field is empty. 
Args: @@ -144,8 +146,7 @@ def stop_on_empty(field: Field) -> Field: return field -@transmutator -def get_nested(field: Field, *path: str) -> Field: +def tsm_get_nested(field: Field, *path: str) -> Field: """Fetches a nested value from a field. Args: @@ -166,8 +167,7 @@ def get_nested(field: Field, *path: str) -> Field: return field -@transmutator -def trim_string(field: Field, max_length: int) -> Field: +def tsm_trim_string(field: Field, max_length: int) -> Field: """Trim string lenght. Args: @@ -184,14 +184,14 @@ def trim_string(field: Field, max_length: int) -> Field: return field -@transmutator -def concat(field: Field, *strings: Any) -> Field: - """Concat strings to build a new one - Use $self to point on field value. +def tsm_concat(field: Field, *strings: Any) -> Field: + """Concatenate strings to build a new one. + + Use `$self` to point on the current field value. Args: - field (Field): Field object - *strings (tuple[str]): strings to concat with + field: Field object + strings: strings to concat with Returns: Field: the same Field with new value @@ -220,8 +220,7 @@ def concat(field: Field, *strings: Any) -> Field: return field -@transmutator -def unique_only(field: Field) -> Field: +def tsm_unique_only(field: Field) -> Field: """Preserve only unique values from list. Args: @@ -236,8 +235,7 @@ def unique_only(field: Field) -> Field: return field -@transmutator -def mapper( +def tsm_mapper( field: Field, mapping: dict[Any, Any], default: Any | None = None ) -> Field: """Map a value with a new value. 
The initial value must serve as a key within @@ -259,8 +257,7 @@ def mapper( return field -@transmutator -def list_mapper( +def tsm_list_mapper( field: Field, mapping: dict[Any, Any], remove: bool | None = False, @@ -291,8 +288,7 @@ def list_mapper( return field -@transmutator -def map_value( +def tsm_map_value( field: Field, test_value: Any, if_same: Any, diff --git a/docs/usage.md b/docs/usage.md deleted file mode 100644 index f9bf888..0000000 --- a/docs/usage.md +++ /dev/null @@ -1,366 +0,0 @@ - -::: transmute.exception.SchemaParsingError - -[Details](index.md) - -::: transmute.exception - options: - members: [] - - - -[![Tests](https://github.com/DataShades/ckanext-transmute/actions/workflows/test.yml/badge.svg)](https://github.com/DataShades/ckanext-transmute/actions/workflows/test.yml) - -# ckanext-transmute -This extension helps to validate and convert data based on a specific schema. - -## Working with transmute - -`ckanext-transmute` provides an action `tsm_transmute`. It helps us to transmute data with the provided conversion scheme. The action doesn't change the original data but creates a new data dict. There are two mandatory arguments: `data` and `schema`. `data` is a data dict you have, and `schema` helps you to validate/change data in it. 
- -### Example - -We have a data dict: - -```json -{ - "title": "Test-dataset", - "email": "test@test.ua", - "metadata_created": "", - "metadata_modified": "", - "metadata_reviewed": "", - "resources": [ - { - "title": "test-res", - "extension": "xml", - "web": "https://stackoverflow.com/", - "sub-resources": [ - { - "title": "sub-res", - "extension": "csv", - "extra": "should-be-removed", - } - ], - }, - { - "title": "test-res2", - "extension": "csv", - "web": "https://stackoverflow.com/", - }, - ], - } -``` - -And we want to achieve this: - -```py -{ - "name": "test-dataset", - "email": "test@test.ua", - "metadata_created": datetime.datetime(2022, 2, 3, 15, 54, 26, 359453), - "metadata_modified": datetime.datetime(2022, 2, 3, 15, 54, 26, 359453), - "metadata_reviewed": datetime.datetime(2022, 2, 3, 15, 54, 26, 359453), - "attachments": [ - { - "name": "test-res", - "format": "XML", - "url": "https://stackoverflow.com/", - "sub-resources": [{"name": "SUB-RES", "format": "CSV"}], - }, - { - "name": "test-res2", - "format": "CSV", - "url": "https://stackoverflow.com/", - }, - ], - } -``` - -Then, our schema must be something like that: - -``` -{ - "root": "Dataset", - "types": { - "Dataset": { - "fields": { - "title": { - "validators": [ - "tsm_string_only", - "tsm_to_lowercase", - "tsm_name_validator", - ], - "map": "name", - }, - "resources": { - "type": "Resource", - "multiple": True, - "map": "attachments", - }, - "metadata_created": { - "validators": ["tsm_isodate"], - "default": "2022-02-03T15:54:26.359453", - }, - "metadata_modified": { - "validators": ["tsm_isodate"], - "default_from": "metadata_created", - }, - "metadata_reviewed": { - "validators": ["tsm_isodate"], - "replace_from": "metadata_modified", - }, - } - }, - "Resource": { - "fields": { - "title": { - "validators": ["tsm_string_only"], - "map": "name", - }, - "extension": { - "validators": ["tsm_string_only", "tsm_to_uppercase"], - "map": "format", - }, - "web": { - "validators": 
["tsm_string_only"], - "map": "url", - }, - "sub-resources": { - "type": "Sub-Resource", - "multiple": True, - }, - }, - }, - "Sub-Resource": { - "fields": { - "title": { - "validators": ["tsm_string_only", "tsm_to_uppercase"], - "map": "name", - }, - "extension": { - "validators": ["tsm_string_only", "tsm_to_uppercase"], - "map": "format", - }, - "extra": { - "remove": True, - }, - } - }, - }, - } -``` - -There is an example of schema with nested types. The `root` field is mandatory, it's must contain a main type name, from which the scheme starts. As you can see, `Dataset` type contains `Resource` type which contans `Sub-Resource`. - -### Transmutators - -There are a few default transmutators you can use in your schema. Of course, you can define a custom transmutator with the `ITransmute ` interface. - -- `tsm_name_validator` - Wrapper over CKAN default `name_validator` validator. -- `tsm_to_lowercase` - Casts string value to lowercase. -- `tsm_to_uppercase` - Casts string value to uppercase. -- `tsm_string_only` - Validates if `field.value` is a string. -- `tsm_isodate` - Validates datetime string. Mutates an iso-like string to datetime object. -- `tsm_to_string` - Casts a `field.value` to `str`. -- `tsm_get_nested` - Allows you to pick up a value from a nested structure. Example: -```py -data = "title_translated": [ - {"nested_field": {"en": "en title", "ar": "العنوان ar"}}, -] - -schema = ... - "title": { - "replace_from": "title_translated", - "validators": [ - ["tsm_get_nested", 0, "nested_field", "en"], - "tsm_to_uppercase", - ], - }, - ... -``` -This will take a value for a `title` field from `title_translated` field. Because `title_translated` is an array with nested objects, we are using the `tsm_get_nested` transmutator to achieve the value from it. - -- `tsm_trim_string` - Trim string with max length. Example to trim `hello world` to `hello`: -```py -data = {"field_name": "hello world} - -schema = ... 
- "field_name": { - "validators": [ - ["tsm_trim_string", 5] - ], - }, - ... -``` -- `tsm_concat` - Concatenate strings. Use `$self` to point on field value. Example: -```py -data = {"id": "dataset-1"} - -schema = ... - "package_url": { - "replace_from": "id", - "validators": [ - [ - "tsm_concat", - "https://site.url/dataset/", - "$self", - ] - ], - }, - ... -``` -- `tsm_unique_only` - Preserve only unique values from a list. Works only with lists. - - -The default transmutator must receive at least one mandatory argument - `field` object. Field contains few properties: `field_name`, `value` and `type`. - -There is a possibility to provide more arguments to a validator like in `tsm_get_nested`. For this use a nested array with first item transmutator and other - arguments to it. - -- `tsm_mapper` - Map current value to the mapping dict - -Map a value to another value. The current value must serve as a key within the mapping dictionary, while the new value will represent the updated value. - -The default value to be used when the key is not found in the mapping. If the default value is not provided, the current value will be used as it. - -```py -data = {"language": "English"} - -schema = ... - "language": { - "validators": [ - [ - "tsm_mapper", - {"English": "eng"}, - "English" - ] - ] - }, - ... -``` - -- `tsm_list_mapper` - Map current value to the mapping dict - -Works as `tsm_mapper` but with list. Doesn't have a `default` value. Third argument `remove` must be `True` or `False`. - -If `remove` set to True, removes values from the list if they don't have a corresponding mapping. Defaults to `False`. - -Example without `remove`: - -```py -data = {"topic": ["Health", "Military", "Utilities"]} - -schema = ... - "topic": { - "validators": [ - [ - "tsm_list_mapper", - {"Military": "Army", "Utilities": "Utility"} - ] - ] - }, - ... 
-``` - -The result here will be `["Health", "Army", "Utility"]` -And here's an example with remove: - -```py -data = {"topic": ["Health", "Military", "Utilities"]} - -schema = build_schema( - "topic": { - "validators": [ - [ - "tsm_list_mapper", - {"Military": "Army", "Utilities": "Utility"}, - True - ] - ] - }, - ... -) -``` -This will result in `["Army", "Utility"]`, and the `Health` will be deleted, cause it doesn't have a mapping. - -### Keywords -1. `map` (`str`) - changes the `field.name` in result dict. -2. `validators` (`list[str]`) - a list of transmutators that will be applied to a `field.value`. A transmutator could be a `string` or a `list` where the first item must be transmutator name and others are arbitrary values. Example: - ``` - ... - "validators": [ - ["tsm_get_nested", "nested_field", "en"], - "tsm_to_uppercase", - , - ... - ``` - There are two transmutators: `tsm_get_nested` and `tsm_to_uppercase`. -3. `multiple` (`bool`, default: `False`) - if the field could have multiple items, e.g `resources` field in dataset, mark it as `multiple` to transmute all the items successively. - ``` - ... - "resources": { - "type": "Resource", - "multiple": True - }, - ... - ``` -4. `remove` (`bool`, default: `False`) - Removes a field from a result dict if `True`. -5. `default` (`Any`) - the default value that will be used if the original field.value evaluates to `False`. -6. `default_from` (`str` | `list`) - acts similar to `default` but accepts a `field.name` of a sibling field from which we want to take its value. Sibling field is a field that located in the same `type`. The current implementation doesn't allow to point on fields from other `types`. Could take a string that represents the `field.name` or an array of strings, to use multiple fields. See `inherit_mode` keyword for details. - ``` - ... - "metadata_modified": { - "validators": ["tsm_isodate"], - "default_from": "metadata_created", - }, - ... - ``` -7. 
`replace_from` (`str`| `list`) - acts similar to `default_from` but replaces the origin value whenever it's empty or not. -8. `inherit_mode` (`str`, default: `combine`) - defines the mode for `default_from` and `replace_from`. By default we are combining values -from all the fields, but we could just use first non-false value, in case if the field might be empty. -9. `value` (`Any`) - a value that will be used for a field. This keyword has the highest priority. Could be used to create a new field with an arbitrary value. -10. `update` (`bool`, default: `False`) - if the original value is mutable (`array`, `object`) - you can update it. You can only update field values of the same types. - -## Installation - -To install ckanext-transmute: - -1. Activate your CKAN virtual environment, for example: - - . /usr/lib/ckan/default/bin/activate - -2. Clone the source and install it on the virtualenv - - git clone https://github.com/DataShades/ckanext-transmute.git - cd ckanext-transmute - pip install -e . - pip install -r requirements.txt - -3. Add `transmute` to the `ckan.plugins` setting in your CKAN - config file (by default the config file is located at - `/etc/ckan/default/ckan.ini`). - -4. Restart CKAN. For example if you've deployed CKAN with Apache on Ubuntu: - - sudo service apache2 reload - - -## Developer installation - -To install ckanext-transmute for development, activate your CKAN virtualenv and -do: - - git clone https://github.com/DataShades/ckanext-transmute.git - cd ckanext-transmute - python setup.py develop - pip install -r dev-requirements.txt - - -## Tests - -I've used TDD to write this extension, so if you changing something be sure that all the tests are valid. 
To run the tests, do: - - pytest --ckan-ini=test.ini - -## License - -[AGPL](https://www.gnu.org/licenses/agpl-3.0.en.html) diff --git a/docs/usage/index.md b/docs/usage/index.md new file mode 100644 index 0000000..1e48d29 --- /dev/null +++ b/docs/usage/index.md @@ -0,0 +1,142 @@ +# Overview + +`ckanext-transmute` registers an action `tsm_transmute` to transmute data using +the provided conversion scheme. The action doesn't change the original data but +creates a new data dict. There are two mandatory arguments: `data` and +`schema`. `data` is a data dict you need to transform, and `schema` contains +the rules describing all the transformation steps. + +Typical use-case for it is transforming existing data, like this: + +```json +{ + "title": "Test-dataset", + "email": "test@test.ua", + "metadata_created": "", + "metadata_modified": "", + "metadata_reviewed": "", + "resources": [ + { + "title": "test-res", + "extension": "xml", + "web": "https://stackoverflow.com/", + "sub-resources": [ + { + "title": "sub-res", + "extension": "csv", + "extra": "should-be-removed" + } + ] + }, + { + "title": "test-res2", + "extension": "csv", + "web": "https://stackoverflow.com/" + } + ] +} + +``` + +into expected data, like this: + +```py +{ + "name": "test-dataset", + "email": "test@test.ua", + "metadata_created": datetime.datetime(2022, 2, 3, 15, 54, 26, 359453), + "metadata_modified": datetime.datetime(2022, 2, 3, 15, 54, 26, 359453), + "metadata_reviewed": datetime.datetime(2022, 2, 3, 15, 54, 26, 359453), + "attachments": [ + { + "name": "test-res", + "format": "XML", + "url": "https://stackoverflow.com/", + "sub-resources": [{"name": "SUB-RES", "format": "CSV"}] + }, + { + "name": "test-res2", + "format": "CSV", + "url": "https://stackoverflow.com/" + } + ] +} +``` + +To achieve this goal, the following schema definition can be used: +```python +{ + "root": "Dataset", + "types": { + "Dataset": { + "fields": { + "title": { + "validators": [ + "tsm_string_only", + 
"tsm_to_lowercase", + "tsm_name_validator", + ], + "map": "name", + }, + "resources": { + "type": "Resource", + "multiple": True, + "map": "attachments", + }, + "metadata_created": { + "validators": ["tsm_isodate"], + "default": "2022-02-03T15:54:26.359453", + }, + "metadata_modified": { + "validators": ["tsm_isodate"], + "default_from": "metadata_created", + }, + "metadata_reviewed": { + "validators": ["tsm_isodate"], + "replace_from": "metadata_modified", + }, + } + }, + "Resource": { + "fields": { + "title": { + "validators": ["tsm_string_only"], + "map": "name", + }, + "extension": { + "validators": ["tsm_string_only", "tsm_to_uppercase"], + "map": "format", + }, + "web": { + "validators": ["tsm_string_only"], + "map": "url", + }, + "sub-resources": { + "type": "Sub-Resource", + "multiple": True, + }, + }, + }, + "Sub-Resource": { + "fields": { + "title": { + "validators": ["tsm_string_only", "tsm_to_uppercase"], + "map": "name", + }, + "extension": { + "validators": ["tsm_string_only", "tsm_to_uppercase"], + "map": "format", + }, + "extra": { + "remove": True, + }, + } + }, + }, +} +``` + +This is an example of a schema with nested types. The `root` field defines the +type of the outer layer of data, while the `resources` field of the root type and +the `sub-resources` field of the `Resource` type contain `type` references to the +nested `Resource` and `Sub-Resource` definitions. diff --git a/docs/usage/schema.md b/docs/usage/schema.md new file mode 100644 index 0000000..77c68cd --- /dev/null +++ b/docs/usage/schema.md @@ -0,0 +1,38 @@ +A transmutation schema is represented by a dictionary that contains descriptions of +all data types used for transmutation and the name of the `root` type. + +```json +{ + "root": "main", + "types": { + "main": {}, + "secondary": {} + } +} +``` + +The `root` type is used for the initial transformation. If, during this +transformation, some multivalued fields contain references to other types +defined in the schema, these types will be used for further transformation of data.
+ +```json +{ + "root": "main", + "types": { + "main": { + "drop_unknown_fields": true, + "fields": { + "child": {"type": "secondary", "multiple": true} + } + }, + "secondary": { + "drop_unknown_fields": true, + "fields": {"name": {}} + } + } +} +``` + +!!! note + At the moment, only multivalued fields can be transformed using nested + types. In the future, support for single-valued nested fields will be added. diff --git a/docs/usage/transmutators.md b/docs/usage/transmutators.md new file mode 100644 index 0000000..f43eab6 --- /dev/null +++ b/docs/usage/transmutators.md @@ -0,0 +1,22 @@ +Transmutators are similar to CKAN validators. They accept the field and modify +it. But unlike validators, transmutators work with the field and have access to the +whole schema definition. + +Usually, a transmutator is defined as a function with a single argument. This +argument always receives the instance of the validated field. It's a dataclass with the following attributes: + +* `field_name`: the name of the processed field +* `value`: current value of the field +* `type`: the name of the type that contains the field definition +* `data`: the whole data dictionary that is currently being transmuted + +A transmutator modifies the field in place and returns the whole field when the job is done. + + +ckanext-transmute contains a number of transmutators that can be used without +additional configuration. And if you need more, you can define a custom +transmutator with the `ITransmute` interface. + +::: transmute.transmutators + options: + show_root_heading: false diff --git a/docs/usage/type.md b/docs/usage/type.md new file mode 100644 index 0000000..d6aeb96 --- /dev/null +++ b/docs/usage/type.md @@ -0,0 +1,52 @@ +A type description contains the definitions of its fields and a number of additional +settings.
+ + +```json +{ + "root": "main", + "types": { + "main": { + "drop_unknown_fields": true, + "fields": { + "first": {}, + "second": {} + } + } + } +} +``` + +Every field either refers to a different type if it's defined with `multiple: +true` and `type: TYPE_NAME`, or contains an inline definition. Inline fields are +used most often and their definition is flexible enough to cover the majority of +use-cases. + +```json +{ + "root": "main", + "types": { + "main": { + "fields": { + "inline_field": {"default": 42}, + "sub_type": {"multiple": true, "type": "secondary"} + } + }, + "secondary": {} + } +} +``` + +Here's the list of attributes that can be used in the field definition: + +| Attribute | Description | +|--------------------|-----------------------------------------------------------------------| +| `map` | New name of the field | +| `validators` | List of transmutators applied to the field | +| `remove` | Flag that removes field from data when enabled | +| `default` | Default value if field is missing | +| `default_from` | Name of the field used as source of default value | +| `value` | Static value that replaces any existing value of the field | +| `replace_from` | Name of the field used as a source of value | +| `validate_missing` | Flag that applies validation even if data does not contain the field | +| `weight` | Weight that controls order of field processing | diff --git a/mkdocs.yml b/mkdocs.yml index 69662d7..b0c7727 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -76,7 +76,11 @@ theme: nav: - index.md - installation.md - - usage.md + - Usage: + - usage/index.md + - usage/schema.md + - usage/type.md + - usage/transmutators.md - api.md - interfaces.md - configuration.md