Skip to content

Commit

Permalink
Introducing a new parameter to get additional metadata (#1926)
Browse files Browse the repository at this point in the history
* extract SotD and abstract if additionalMetadata=true is specified

* describe additional metadata

* fix parameter name

* skip section title for abstract
  • Loading branch information
deniak authored Feb 18, 2025
1 parent 9959469 commit 7972b61
Show file tree
Hide file tree
Showing 8 changed files with 96 additions and 21 deletions.
37 changes: 21 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,26 +178,28 @@ Once the [event `end-all`](#validation-events) is emitted, the metadata should b
The `options` accepted are equal to those in `validate()`, except that a `profile` is not necessary and will be ignored (finding out the profile is one of the
goals of this method).

`this.meta` will be an `Object` and may include up to 16 properties described below:
`this.meta` will be an `Object` and may include up to 20 properties described below:

- `profile`
- `title`: The (possible) title of the document.
- `docDate`: The date associated to the document.
- `thisVersion`: URL of this version of the document.
- `latestVersion`: URL of the latest version of the document.
- `previousVersion`: URL of the previous version of the document (the last one, if multiple are shown).
- `editorsDraft`: URL of the latest editor's draft.
- `delivererIDs`: ID(s) of the deliverer(s); an `Array` of `Number`s.
- `editorIDs`: ID(s) of the editor(s) responsible for the document; an `Array` of `Number`s.
- `informative`: Whether the document in informative or not.
- `process`: The process rules link.
- `sameWorkAs`: The previous shortlink if any.
- `implementationFeedbackDue`: The implementation review date for CRs.
- `prReviewsDue`: The review date for PRs.
- `implementationReport`: Implementation report link for CRs, PRs and RECs.
- `errata`: The errata link of the document.
- `title`: The (possible) title of the document
- `docDate`: The date associated with the document
- `thisVersion`: URL of this version of the document
- `latestVersion`: URL of the latest version of the document
- `previousVersion`: URL of the previous version of the document (the last one, if multiple are shown)
- `editorsDraft`: URL of the latest editor's draft
- `delivererIDs`: ID(s) of the deliverer(s); an `Array` of `Number`s
- `editorIDs`: ID(s) of the editor(s) responsible for the document; an `Array` of `Number`s
- `informative`: Whether the document is informative or not
- `process`: The process rules link
- `sameWorkAs`: The previous shortlink if any
- `implementationFeedbackDue`: The implementation review date for CRs
- `prReviewsDue`: The review date for PRs
- `implementationReport`: Implementation report link for CRs, PRs and RECs
- `errata`: The errata link of the document
- `substantiveChanges`: Whether the document is a REC and has proposed amendments
- `newFeatures`: Whether the document is a REC and has proposed additions
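- `sotd`: The HTML content of the "Status of this Document" section (only extracted when the `additionalMetadata` option is set; `'Not found'` if the section is missing)
- `abstract`: The HTML content of the abstract, without its section title (only extracted when the `additionalMetadata` option is set; `'Not found'` if the abstract is missing)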

If some of these pieces of metadata cannot be deduced, that key will not exist, or its value will not be defined.

Expand Down Expand Up @@ -259,6 +261,9 @@ curl "https://www.w3.org/pubrules/api/metadata?url=https://example.com/doc.html"

# POST
curl "https://www.w3.org/pubrules/api/metadata" -F "file=@/tmp/foo.html"

# GET with additional metadata
curl "https://www.w3.org/pubrules/api/metadata?url=https://example.com/doc.html&additionalMetadata=true"
```

Metadata is a set of data extracted from the document. It includes the type (profile) of the document, its publication date, the editors' names, the Patent Policy version the document is under, etc.
Expand Down
1 change: 1 addition & 0 deletions lib/api.js
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ const processRequest = async (req, res, params) => {
);
});
options.events = handler;
options.additionalMetadata = req.query.additionalMetadata === 'true';
if (validate) v.validate(options);
else v.extractMetadata(options);
}
Expand Down
2 changes: 2 additions & 0 deletions lib/l10n-en_GB.js
Original file line number Diff line number Diff line change
Expand Up @@ -380,4 +380,6 @@ export const messages = {
'metadata.errata': false,
'metadata.patent-policy': false,
'metadata.charters': false,
'metadata.sotd': false,
'metadata.abstract': false,
};
16 changes: 16 additions & 0 deletions lib/profiles/additionalMetadata.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/**
* Pseudo-profile for additional metadata extraction.
*/

import { rules as baseRules } from './metadata.js';
import { insertAfter } from './profileUtil.js';

import * as abstract from '../rules/metadata/abstract.js';
import * as sotd from '../rules/metadata/sotd.js';

// Identifier exported as this pseudo-profile's name.
export const name = 'AdditionalMetadata';

// Rule list used for additional metadata extraction: the base metadata rules
// combined with the `abstract` and `sotd` pseudo-rules.
// NOTE(review): presumably insertAfter() places the new rules right after the
// 'metadata.errata' rule — confirm against profileUtil.js.
export const rules = insertAfter(baseRules, 'metadata.errata', [
abstract,
sotd,
]);
33 changes: 33 additions & 0 deletions lib/rules/metadata/abstract.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/**
* Pseudo-rule for metadata extraction: abstract.
*/

export const name = 'metadata.abstract';

/**
 * Extract the abstract of the document as an HTML string.
 *
 * Looks for the first `h2` whose text, once passed through `sr.norm()` and
 * lower-cased, is exactly "abstract". Every sibling element sharing that
 * heading's parent is cloned into a scratch `div` (the heading itself is left
 * out), and the `innerHTML` of that `div`, passed through `sr.norm()`, is
 * reported. When no such heading exists, the string 'Not found' is reported.
 *
 * @param sr - validator instance; must expose `jsDocument` and `norm()`
 * @param done - callback receiving `{ abstract: <string> }`
 * @returns whatever `done` returns
 */
export function check(sr, done) {
    const isAbstractHeading = heading =>
        sr.norm(heading.textContent).toLowerCase() === 'abstract';
    const heading = Array.from(sr.jsDocument.querySelectorAll('h2')).find(
        isAbstractHeading
    );

    // Guard clause: nothing to extract without an "Abstract" heading.
    if (!heading) {
        return done({ abstract: 'Not found' });
    }

    // Work on clones so the parsed document is left untouched.
    const container = sr.jsDocument.createElement('div');
    Array.from(heading.parentElement.children)
        .filter(node => node !== heading)
        .forEach(node => container.appendChild(node.cloneNode(true)));

    return done({ abstract: sr.norm(container.innerHTML) });
}
14 changes: 14 additions & 0 deletions lib/rules/metadata/sotd.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/**
* Pseudo-rule for metadata extraction: sotd.
*/

export const name = 'metadata.sotd';

/**
 * Extract the "Status of this Document" section as an HTML string.
 *
 * Reports the `innerHTML` of the element returned by `sr.getSotDSection()`,
 * passed through `sr.norm()`; reports the string 'Not found' when that call
 * yields nothing.
 *
 * @param sr - validator instance; must expose `getSotDSection()` and `norm()`
 * @param done - callback receiving `{ sotd: <string> }`
 * @returns whatever `done` returns
 */
export function check(sr, done) {
    const section = sr.getSotDSection();

    // Guard clause: no SotD section could be located in the document.
    if (!section) {
        return done({ sotd: 'Not found' });
    }

    return done({ sotd: sr.norm(section.innerHTML) });
}
3 changes: 2 additions & 1 deletion lib/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ const buildProcessParamsFunction = function () {
p === 'informativeOnly' ||
p === 'echidnaReady' ||
p === 'events' ||
p === 'editorial'
p === 'editorial' ||
p === 'additionalMetadata'
) {
// Other params:
if (Object.prototype.hasOwnProperty.call(result, p))
Expand Down
11 changes: 7 additions & 4 deletions lib/validator.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import w3cApi from 'node-w3capi';
import { Exceptions } from './exceptions.js';
import { assembleData, setLanguage } from './l10n.js';
import * as profileMetadata from './profiles/metadata.js';
import * as profileAdditionalMetadata from './profiles/additionalMetadata.js';
import { get } from './throttled-ua.js';
import {
AB,
Expand All @@ -22,7 +23,6 @@ import {
TAG,
} from './util.js';

const { rules } = profileMetadata;
const { version } = importJSON('../package.json', import.meta.url);

setLanguage('en_GB');
Expand Down Expand Up @@ -80,10 +80,13 @@ Specberus.prototype.extractMetadata = function (options) {
const doMetadataExtraction = function (err, jsDocument) {
if (err) return self.throw(err);
self.jsDocument = jsDocument;
self.sink.emit('start-all', profileMetadata);
const total = (rules || []).length;
const profile = options.additionalMetadata
? profileAdditionalMetadata
: profileMetadata;
self.sink.emit('start-all', profile);
const total = (profile.rules || []).length;
let done = 0;
rules.forEach(rule => {
profile.rules.forEach(rule => {
try {
rule.check(
self,
Expand Down

0 comments on commit 7972b61

Please sign in to comment.