Skip to content

Commit

Permalink
Refactor metadata extraction to a separate class. (#83)
Browse files Browse the repository at this point in the history
looks great thanks!
  • Loading branch information
demiankatz authored Sep 23, 2021
1 parent c4600e1 commit ac0b46d
Show file tree
Hide file tree
Showing 2 changed files with 246 additions and 172 deletions.
190 changes: 18 additions & 172 deletions api/src/services/HierarchyCollector.ts
Original file line number Diff line number Diff line change
@@ -1,187 +1,33 @@
import { DC, Fedora } from "./Fedora";
import { Fedora } from "./Fedora";
import Config from "../models/Config";
import FedoraData from "../models/FedoraData";
import { DOMParser } from "@xmldom/xmldom";
import xpath = require("xpath");
import MetadataExtractor from "./MetadataExtractor";
import TikaExtractor from "./TikaExtractor";

class HierarchyCollector {
private static instance: HierarchyCollector;

fedora: Fedora;
// PIDs that define the top of a hierarchy. Typically this
// includes the overall top PID, plus the top public PID.
hierarchyTops: Array<string>;
extractor: MetadataExtractor;
config: Config;

constructor(fedora: Fedora, config: Config) {
constructor(fedora: Fedora, extractor: MetadataExtractor, config: Config) {
this.fedora = fedora;
this.extractor = extractor;
this.config = config;
}

public static getInstance(): HierarchyCollector {
if (!HierarchyCollector.instance) {
HierarchyCollector.instance = new HierarchyCollector(Fedora.getInstance(), Config.getInstance());
HierarchyCollector.instance = new HierarchyCollector(
Fedora.getInstance(),
MetadataExtractor.getInstance(),
Config.getInstance()
);
}
return HierarchyCollector.instance;
}

protected extractMetadata(dc: DC): Record<string, Array<string>> {
if (typeof dc.children === "undefined") {
throw new Error("Unexpected failure: childless Dublin Core!");
}
const metadata: Record<string, Array<string>> = {};
dc.children.forEach((field) => {
if (typeof metadata[field.name] === "undefined") {
metadata[field.name] = [];
}
metadata[field.name].push(field.value);
});
return metadata;
}

protected extractRDFXML(
xml: Document,
namespaces: Record<string, string>,
xpathQuery: string
): Record<string, Array<string>> {
const rdfXPath = xpath.useNamespaces(namespaces);
const relations: Record<string, Array<string>> = {};
rdfXPath(xpathQuery, xml).forEach((relation: Node) => {
let values = rdfXPath("text()", relation) as Array<Node>;
// If there's a namespace on the node name, strip it:
const nodeName = relation.nodeName.split(":").pop();
if (values.length === 0) {
values = rdfXPath("./@rdf:resource", relation) as Array<Node>;
}
if (values.length > 0) {
if (typeof relations[nodeName] === "undefined") {
relations[nodeName] = [];
}
relations[nodeName].push(values[0].nodeValue);
}
});
return relations;
}

protected extractRelations(RELS: string): Record<string, Array<string>> {
const xmlParser = new DOMParser();
const RELS_XML = xmlParser.parseFromString(RELS, "text/xml");
return this.extractRDFXML(
RELS_XML,
{
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
},
"//rdf:Description/*"
);
}

protected extractFedoraDetails(RDF: string): Record<string, Array<string>> {
const xmlParser = new DOMParser();
const RDF_XML = xmlParser.parseFromString(RDF, "text/xml");
const details = this.extractRDFXML(
RDF_XML,
{
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
fedora: "http://fedora.info/definitions/v4/repository#",
"fedora3-model": "info:fedora/fedora-system:def/model#",
"fedora3-view": "info:fedora/fedora-system:def/view#",
},
"//rdf:Description/fedora:*|//rdf:Description/fedora3-model:*|//rdf:Description/fedora3-view:*"
);
// The new (F6) created and lastModified properties should take
// precedence over the legacy (F3) createdDate and lastModifiedDate
// properties when present.
if (typeof details.created !== "undefined") {
details.createdDate = details.created;
delete details.created;
}
if (typeof details.lastModified !== "undefined") {
details.lastModifiedDate = details.lastModified;
delete details.lastModified;
}
return details;
}

protected extractFedoraDatastreams(RDF: string): Array<string> {
const xmlParser = new DOMParser();
const RDF_XML = xmlParser.parseFromString(RDF, "text/xml");
const raw =
this.extractRDFXML(
RDF_XML,
{
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
ldp: "http://www.w3.org/ns/ldp#",
},
"//ldp:contains"
)["contains"] ?? [];
return raw.map((ds) => {
return ds.split("/").pop();
});
}

protected extractLicense(XML: string): string {
const xmlParser = new DOMParser();
const parsedXml = xmlParser.parseFromString(XML, "text/xml");
const namespaces = {
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
METS: "http://www.loc.gov/METS/",
xlink: "http://www.w3.org/1999/xlink",
};
const rdfXPath = xpath.useNamespaces(namespaces);
let license = null;
rdfXPath("//@xlink:href", parsedXml).forEach((relation: Node) => {
license = relation.nodeValue;
});
return license;
}

protected extractAgents(xml: string): Record<string, Array<string>> {
const xmlParser = new DOMParser();
const RDF_XML = xmlParser.parseFromString(xml, "text/xml");
return this.extractRDFXML(
RDF_XML,
{
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
METS: "http://www.loc.gov/METS/",
},
"//METS:agent/*"
);
}

protected extractFitsData(xml: string): Record<string, Array<string>> {
const xmlParser = new DOMParser();
const RDF_XML = xmlParser.parseFromString(xml, "text/xml");
const namespaces = {
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
fits: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output",
};
const details = this.extractRDFXML(
RDF_XML,
namespaces,
"//fits:fileinfo/fits:size|//fits:imageWidth|//fits:imageHeight"
);
details.mimetype = [];
const fitsXPath = xpath.useNamespaces(namespaces);
fitsXPath("//fits:identity/@mimetype", RDF_XML).forEach((relation: Node) => {
details.mimetype.push(relation.nodeValue);
});
return details;
}

protected extractThumbnailDetails(xml: string): Record<string, Array<string>> {
const xmlParser = new DOMParser();
const RDF_XML = xmlParser.parseFromString(xml, "text/xml");
return this.extractRDFXML(
RDF_XML,
{
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
premis: "http://www.loc.gov/premis/rdf/v1#",
},
"//premis:*"
);
}

async getFedoraData(pid: string, fetchRdf = true): Promise<FedoraData> {
// Use Fedora to get data
const DCPromise = this.fedora.getDC(pid);
Expand All @@ -191,25 +37,25 @@ class HierarchyCollector {
// we can skip fetching more RDF in order to save some time!
const RDFPromise = fetchRdf ? this.fedora.getRdf(pid) : null;
const [DC, RELS, RDF] = await Promise.all([DCPromise, RELSPromise, RDFPromise]);
const dataStreams = fetchRdf ? this.extractFedoraDatastreams(RDF) : [];
const relations = this.extractRelations(RELS);
const dataStreams = fetchRdf ? this.extractor.extractFedoraDatastreams(RDF) : [];
const relations = this.extractor.extractRelations(RELS);
// Fetch license details if appropriate/available:
const extraDetails: Record<string, Record<string, Array<string>>> = {};
if (dataStreams.includes("LICENSE")) {
const licenseStream = await this.fedora.getDatastreamAsString(pid, "LICENSE");
extraDetails.license = { url: [this.extractLicense(licenseStream)] };
extraDetails.license = { url: [this.extractor.extractLicense(licenseStream)] };
}
if (dataStreams.includes("AGENTS")) {
const agentsStream = await this.fedora.getDatastreamAsString(pid, "AGENTS");
extraDetails.agents = this.extractAgents(agentsStream);
extraDetails.agents = this.extractor.extractAgents(agentsStream);
}
if (dataStreams.includes("THUMBNAIL")) {
const thumbRdf = await this.fedora.getRdf(pid + "/THUMBNAIL/fcr:metadata");
extraDetails.thumbnails = this.extractThumbnailDetails(thumbRdf);
extraDetails.thumbnails = this.extractor.extractThumbnailDetails(thumbRdf);
}
if (dataStreams.includes("MASTER-MD")) {
const fitsXml = await this.fedora.getDatastreamAsString(pid, "MASTER-MD");
extraDetails.fitsData = this.extractFitsData(fitsXml);
extraDetails.fitsData = this.extractor.extractFitsData(fitsXml);
}
extraDetails.fullText = {};
if (dataStreams.includes("OCR-DIRTY")) {
Expand All @@ -223,8 +69,8 @@ class HierarchyCollector {
return new FedoraData(
pid,
relations,
this.extractMetadata(DC),
fetchRdf ? this.extractFedoraDetails(RDF) : {},
this.extractor.extractMetadata(DC),
fetchRdf ? this.extractor.extractFedoraDetails(RDF) : {},
dataStreams,
extraDetails
);
Expand Down
Loading

0 comments on commit ac0b46d

Please sign in to comment.