From 833d6e7a657c652bb60939ddcfd997ccdf5fa4a7 Mon Sep 17 00:00:00 2001 From: Demian Katz Date: Thu, 23 Sep 2021 10:41:53 -0400 Subject: [PATCH] Refactor metadata extraction to a separate class. --- api/src/services/HierarchyCollector.ts | 190 ++------------------- api/src/services/MetadataExtractor.ts | 228 +++++++++++++++++++++++++ 2 files changed, 246 insertions(+), 172 deletions(-) create mode 100644 api/src/services/MetadataExtractor.ts diff --git a/api/src/services/HierarchyCollector.ts b/api/src/services/HierarchyCollector.ts index fe754fc1..ac737cd0 100644 --- a/api/src/services/HierarchyCollector.ts +++ b/api/src/services/HierarchyCollector.ts @@ -1,187 +1,33 @@ -import { DC, Fedora } from "./Fedora"; +import { Fedora } from "./Fedora"; import Config from "../models/Config"; import FedoraData from "../models/FedoraData"; -import { DOMParser } from "@xmldom/xmldom"; -import xpath = require("xpath"); +import MetadataExtractor from "./MetadataExtractor"; import TikaExtractor from "./TikaExtractor"; class HierarchyCollector { private static instance: HierarchyCollector; fedora: Fedora; - // PIDs that define the top of a hierarchy. Typically this - // includes the overall top PID, plus the top public PID. - hierarchyTops: Array; + extractor: MetadataExtractor; config: Config; - constructor(fedora: Fedora, config: Config) { + constructor(fedora: Fedora, extractor: MetadataExtractor, config: Config) { this.fedora = fedora; + this.extractor = extractor; this.config = config; } public static getInstance(): HierarchyCollector { if (!HierarchyCollector.instance) { - HierarchyCollector.instance = new HierarchyCollector(Fedora.getInstance(), Config.getInstance()); + HierarchyCollector.instance = new HierarchyCollector( + Fedora.getInstance(), + MetadataExtractor.getInstance(), + Config.getInstance() + ); } return HierarchyCollector.instance; } - protected extractMetadata(dc: DC): Record> { - if (typeof dc.children === "undefined") { - throw new Error("Unexpected failure: childless Dublin Core!"); - } - const metadata: Record> = {}; - dc.children.forEach((field) => { - if (typeof metadata[field.name] === "undefined") { - metadata[field.name] = []; - } - metadata[field.name].push(field.value); - }); - return metadata; - } - - protected extractRDFXML( - xml: Document, - namespaces: Record, - xpathQuery: string - ): Record> { - const rdfXPath = xpath.useNamespaces(namespaces); - const relations: Record> = {}; - rdfXPath(xpathQuery, xml).forEach((relation: Node) => { - let values = rdfXPath("text()", relation) as Array; - // If there's a namespace on the node name, strip it: - const nodeName = relation.nodeName.split(":").pop(); - if (values.length === 0) { - values = rdfXPath("./@rdf:resource", relation) as Array; - } - if (values.length > 0) { - if (typeof relations[nodeName] === "undefined") { - relations[nodeName] = []; - } - relations[nodeName].push(values[0].nodeValue); - } - }); - return relations; - } - - protected extractRelations(RELS: string): Record> { - const xmlParser = new DOMParser(); - const RELS_XML = xmlParser.parseFromString(RELS, "text/xml"); - return this.extractRDFXML( - RELS_XML, - { - rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - }, - "//rdf:Description/*" - ); - } - - protected extractFedoraDetails(RDF: string): Record> { - const xmlParser = new DOMParser(); - const RDF_XML = xmlParser.parseFromString(RDF, "text/xml"); - const details = this.extractRDFXML( - RDF_XML, - { - rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - fedora: "http://fedora.info/definitions/v4/repository#", - "fedora3-model": "info:fedora/fedora-system:def/model#", - "fedora3-view": "info:fedora/fedora-system:def/view#", - }, - "//rdf:Description/fedora:*|//rdf:Description/fedora3-model:*|//rdf:Description/fedora3-view:*" - ); - // The new (F6) created and lastModified properties should take - // precedence over the legacy (F3) createdDate and lastModifiedDate - // properties when present. - if (typeof details.created !== "undefined") { - details.createdDate = details.created; - delete details.created; - } - if (typeof details.lastModified !== "undefined") { - details.lastModifiedDate = details.lastModified; - delete details.lastModified; - } - return details; - } - - protected extractFedoraDatastreams(RDF: string): Array { - const xmlParser = new DOMParser(); - const RDF_XML = xmlParser.parseFromString(RDF, "text/xml"); - const raw = - this.extractRDFXML( - RDF_XML, - { - rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - ldp: "http://www.w3.org/ns/ldp#", - }, - "//ldp:contains" - )["contains"] ?? []; - return raw.map((ds) => { - return ds.split("/").pop(); - }); - } - - protected extractLicense(XML: string): string { - const xmlParser = new DOMParser(); - const parsedXml = xmlParser.parseFromString(XML, "text/xml"); - const namespaces = { - rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - METS: "http://www.loc.gov/METS/", - xlink: "http://www.w3.org/1999/xlink", - }; - const rdfXPath = xpath.useNamespaces(namespaces); - let license = null; - rdfXPath("//@xlink:href", parsedXml).forEach((relation: Node) => { - license = relation.nodeValue; - }); - return license; - } - - protected extractAgents(xml: string): Record> { - const xmlParser = new DOMParser(); - const RDF_XML = xmlParser.parseFromString(xml, "text/xml"); - return this.extractRDFXML( - RDF_XML, - { - rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - METS: "http://www.loc.gov/METS/", - }, - "//METS:agent/*" - ); - } - - protected extractFitsData(xml: string): Record> { - const xmlParser = new DOMParser(); - const RDF_XML = xmlParser.parseFromString(xml, "text/xml"); - const namespaces = { - rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - fits: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output", - }; - const details = this.extractRDFXML( - RDF_XML, - namespaces, - "//fits:fileinfo/fits:size|//fits:imageWidth|//fits:imageHeight" - ); - details.mimetype = []; - const fitsXPath = xpath.useNamespaces(namespaces); - fitsXPath("//fits:identity/@mimetype", RDF_XML).forEach((relation: Node) => { - details.mimetype.push(relation.nodeValue); - }); - return details; - } - - protected extractThumbnailDetails(xml: string): Record> { - const xmlParser = new DOMParser(); - const RDF_XML = xmlParser.parseFromString(xml, "text/xml"); - return this.extractRDFXML( - RDF_XML, - { - rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - premis: "http://www.loc.gov/premis/rdf/v1#", - }, - "//premis:*" - ); - } - async getFedoraData(pid: string, fetchRdf = true): Promise { // Use Fedora to get data const DCPromise = this.fedora.getDC(pid); @@ -191,25 +37,25 @@ class HierarchyCollector { // we can skip fetching more RDF in order to save some time! const RDFPromise = fetchRdf ? this.fedora.getRdf(pid) : null; const [DC, RELS, RDF] = await Promise.all([DCPromise, RELSPromise, RDFPromise]); - const dataStreams = fetchRdf ? this.extractFedoraDatastreams(RDF) : []; - const relations = this.extractRelations(RELS); + const dataStreams = fetchRdf ? this.extractor.extractFedoraDatastreams(RDF) : []; + const relations = this.extractor.extractRelations(RELS); // Fetch license details if appropriate/available: const extraDetails: Record>> = {}; if (dataStreams.includes("LICENSE")) { const licenseStream = await this.fedora.getDatastreamAsString(pid, "LICENSE"); - extraDetails.license = { url: [this.extractLicense(licenseStream)] }; + extraDetails.license = { url: [this.extractor.extractLicense(licenseStream)] }; } if (dataStreams.includes("AGENTS")) { const agentsStream = await this.fedora.getDatastreamAsString(pid, "AGENTS"); - extraDetails.agents = this.extractAgents(agentsStream); + extraDetails.agents = this.extractor.extractAgents(agentsStream); } if (dataStreams.includes("THUMBNAIL")) { const thumbRdf = await this.fedora.getRdf(pid + "/THUMBNAIL/fcr:metadata"); - extraDetails.thumbnails = this.extractThumbnailDetails(thumbRdf); + extraDetails.thumbnails = this.extractor.extractThumbnailDetails(thumbRdf); } if (dataStreams.includes("MASTER-MD")) { const fitsXml = await this.fedora.getDatastreamAsString(pid, "MASTER-MD"); - extraDetails.fitsData = this.extractFitsData(fitsXml); + extraDetails.fitsData = this.extractor.extractFitsData(fitsXml); } extraDetails.fullText = {}; if (dataStreams.includes("OCR-DIRTY")) { @@ -223,8 +69,8 @@ class HierarchyCollector { return new FedoraData( pid, relations, - this.extractMetadata(DC), - fetchRdf ? this.extractFedoraDetails(RDF) : {}, + this.extractor.extractMetadata(DC), + fetchRdf ? this.extractor.extractFedoraDetails(RDF) : {}, dataStreams, extraDetails ); diff --git a/api/src/services/MetadataExtractor.ts b/api/src/services/MetadataExtractor.ts new file mode 100644 index 00000000..5e4668b7 --- /dev/null +++ b/api/src/services/MetadataExtractor.ts @@ -0,0 +1,228 @@ +import { DC } from "./Fedora"; +import { DOMParser } from "@xmldom/xmldom"; +import xpath = require("xpath"); + +class MetadataExtractor { + private static instance: MetadataExtractor; + + public static getInstance(): MetadataExtractor { + if (!MetadataExtractor.instance) { + MetadataExtractor.instance = new MetadataExtractor(); + } + return MetadataExtractor.instance; + } + + /** + * Extract Dublin Core metadata from XML. + * + * @param dc Dublin core XML + * @returns Record mapping field names to values + */ + public extractMetadata(dc: DC): Record> { + if (typeof dc.children === "undefined") { + throw new Error("Unexpected failure: childless Dublin Core!"); + } + const metadata: Record> = {}; + dc.children.forEach((field) => { + if (typeof metadata[field.name] === "undefined") { + metadata[field.name] = []; + } + metadata[field.name].push(field.value); + }); + return metadata; + } + + /** + * Extract values from RDF XML. + * + * @param xml XML to process + * @param namespaces Namespace definitions + * @param xpathQuery Xpath query to use for extraction + * @returns Record containing extracted fields to values + */ + protected extractRDFXML( + xml: Document, + namespaces: Record, + xpathQuery: string + ): Record> { + const rdfXPath = xpath.useNamespaces(namespaces); + const relations: Record> = {}; + rdfXPath(xpathQuery, xml).forEach((relation: Node) => { + let values = rdfXPath("text()", relation) as Array; + // If there's a namespace on the node name, strip it: + const nodeName = relation.nodeName.split(":").pop(); + if (values.length === 0) { + values = rdfXPath("./@rdf:resource", relation) as Array; + } + if (values.length > 0) { + if (typeof relations[nodeName] === "undefined") { + relations[nodeName] = []; + } + relations[nodeName].push(values[0].nodeValue); + } + }); + return relations; + } + + /** + * Extract relationships from RELS-EXT XML. + * + * @param RELS RELS-EXT XML + * @returns Record mapping fields to values + */ + public extractRelations(RELS: string): Record> { + const xmlParser = new DOMParser(); + const RELS_XML = xmlParser.parseFromString(RELS, "text/xml"); + return this.extractRDFXML( + RELS_XML, + { + rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + }, + "//rdf:Description/*" + ); + } + + /** + * Extract key details from the description of a Fedora 6 container object. + * + * @param RDF RDF XML from Fedora 6 (describing a container) + * @returns Map of extracted data + */ + public extractFedoraDetails(RDF: string): Record> { + const xmlParser = new DOMParser(); + const RDF_XML = xmlParser.parseFromString(RDF, "text/xml"); + const details = this.extractRDFXML( + RDF_XML, + { + rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + fedora: "http://fedora.info/definitions/v4/repository#", + "fedora3-model": "info:fedora/fedora-system:def/model#", + "fedora3-view": "info:fedora/fedora-system:def/view#", + }, + "//rdf:Description/fedora:*|//rdf:Description/fedora3-model:*|//rdf:Description/fedora3-view:*" + ); + // The new (F6) created and lastModified properties should take + // precedence over the legacy (F3) createdDate and lastModifiedDate + // properties when present. + if (typeof details.created !== "undefined") { + details.createdDate = details.created; + delete details.created; + } + if (typeof details.lastModified !== "undefined") { + details.lastModifiedDate = details.lastModified; + delete details.lastModified; + } + return details; + } + + /** + * Extract a list of binary datastreams from container RDF. + * + * @param RDF RDF XML from Fedora 6 (describing a container) + * @returns List of datastreams (binaries) inside the container + */ + public extractFedoraDatastreams(RDF: string): Array { + const xmlParser = new DOMParser(); + const RDF_XML = xmlParser.parseFromString(RDF, "text/xml"); + const raw = + this.extractRDFXML( + RDF_XML, + { + rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + ldp: "http://www.w3.org/ns/ldp#", + }, + "//ldp:contains" + )["contains"] ?? []; + return raw.map((ds) => { + return ds.split("/").pop(); + }); + } + + /** + * Extract a URI from license XML data. + * + * @param XML LICENSE datastream XML + * @returns License URI + */ + public extractLicense(XML: string): string { + const xmlParser = new DOMParser(); + const parsedXml = xmlParser.parseFromString(XML, "text/xml"); + const namespaces = { + rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + METS: "http://www.loc.gov/METS/", + xlink: "http://www.w3.org/1999/xlink", + }; + const rdfXPath = xpath.useNamespaces(namespaces); + let license = null; + rdfXPath("//@xlink:href", parsedXml).forEach((relation: Node) => { + license = relation.nodeValue; + }); + return license; + } + + /** + * Extract agent names from the AGENTS datastream. + * + * @param xml AGENTS datastream XML + * @returns List of agent names + */ + public extractAgents(xml: string): Record> { + const xmlParser = new DOMParser(); + const RDF_XML = xmlParser.parseFromString(xml, "text/xml"); + return this.extractRDFXML( + RDF_XML, + { + rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + METS: "http://www.loc.gov/METS/", + }, + "//METS:agent/*" + ); + } + + /** + * Extract useful details from FITS technical metadata. + * + * @param xml FITS technical metadata + * @returns Map of extracted details + */ + public extractFitsData(xml: string): Record> { + const xmlParser = new DOMParser(); + const RDF_XML = xmlParser.parseFromString(xml, "text/xml"); + const namespaces = { + rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + fits: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output", + }; + const details = this.extractRDFXML( + RDF_XML, + namespaces, + "//fits:fileinfo/fits:size|//fits:imageWidth|//fits:imageHeight" + ); + details.mimetype = []; + const fitsXPath = xpath.useNamespaces(namespaces); + fitsXPath("//fits:identity/@mimetype", RDF_XML).forEach((relation: Node) => { + details.mimetype.push(relation.nodeValue); + }); + return details; + } + + /** + * Extract information about a binary thumbnail object. + * + * @param xml Fedora 6 RDF XML describing a thumbnail binary + * @returns Map of extracted relevant details + */ + public extractThumbnailDetails(xml: string): Record> { + const xmlParser = new DOMParser(); + const RDF_XML = xmlParser.parseFromString(xml, "text/xml"); + return this.extractRDFXML( + RDF_XML, + { + rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + premis: "http://www.loc.gov/premis/rdf/v1#", + }, + "//premis:*" + ); + } +} + +export default MetadataExtractor;