Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvements for referential integrity checking #980

Merged
merged 4 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,10 @@ public class ReferentialIntegrityUtil {
private static ArrayList<URL> urlsParsedCumulative = new ArrayList<>(0);
private static ArrayList<String> logicalIdentifiersCumulative = new ArrayList<>(0);
private static ArrayList<String> lidOrLidVidReferencesCumulative = new ArrayList<>(0);
private static HashSet<String> lidAndFilenameCombo = new HashSet<String>();
private static HashMap<String, HashSetReferenceInfo> contextReferencesCumulative =
new HashMap<>(0); // Collect all references defined in "Context_Area" tag from all labels.
private static HashMap<String, HashSet> bundleOrCollectionReferenceMap = new HashMap<String, HashSet>(); // Collect
private static HashMap<String, HashSet<String>> bundleOrCollectionReferenceMap = new HashMap<String, HashSet<String>>(); // Collect
// all
// references
// defined
Expand Down Expand Up @@ -170,6 +171,7 @@ public static void reset() {
// and it will be difficult to figure out why. The code may work when validate
// runs from the command line
// but not in regression test.
ReferentialIntegrityUtil.lidAndFilenameCombo.clear();
ReferentialIntegrityUtil.logicalIdentifiersCumulative.clear();
ReferentialIntegrityUtil.lidOrLidVidReferencesCumulative.clear();
ReferentialIntegrityUtil.contextReferencesCumulative.clear();
Expand Down Expand Up @@ -466,35 +468,6 @@ public static void reportLidOrLidvidReferenceToNonExistLogicalReferences() {
}
}

/**
 * Tells whether a given reference has already been recorded for a given file.
 *
 * The reference id alone is not unique, so the check is made on the plain string
 * concatenation of the lid_reference (or lidvid_reference) and the file name. No
 * separator such as '/' is inserted between the two parts.
 *
 * @param singleLidorLidVidReference the lid_reference or lidvid_reference to look for
 * @param filename the URL of the file the reference came from
 * @return true if the same reference/file name combination is already present in the
 *         cumulative lists, false otherwise
 */
private static boolean hasReferenceIDAndFilenameComboAdded(String singleLidorLidVidReference,
    URL filename) {
  // The combination we are searching for, built from the input parameters.
  final String wantedCombo = singleLidorLidVidReference + filename.toString();

  int index = 0;
  while (index < ReferentialIntegrityUtil.lidOrLidVidReferencesCumulative.size()) {
    // Rebuild the same kind of combination from the entries stored at this position in
    // lidOrLidVidReferencesCumulative and lidOrLidVidReferencesCumulativeFileNames.
    String storedCombo = ReferentialIntegrityUtil.lidOrLidVidReferencesCumulative.get(index)
        + ReferentialIntegrityUtil.lidOrLidVidReferencesCumulativeFileNames.get(index);
    LOG.debug(
        "hasReferenceIDAndFilenameComboAdded:referenceIDAndFilenameComboValue,singleComboValue {},{}",
        wantedCombo, storedCombo);
    if (wantedCombo.equals(storedCombo)) {
      // First match answers the question; no need to look at the remaining entries.
      return true;
    }
    index++;
  }
  return false;
}

private static boolean isIdentiferMatchingBundleBaseID(String singleLogicalIdentifier) {
// Given a logical identifier, check if it contains the bundle base identifier.
// If the bundle base identifier is urn:nasa:pds:kaguya_grs_spectra
Expand Down Expand Up @@ -757,10 +730,10 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun

boolean labelIsCollectionFlag = false;
boolean labelIsBundleFlag = false;
List<Target> children = new ArrayList<>();
String parentId = null;

try {
List<Target> children = new ArrayList<>();
if (getContext().getCrawler() != null) {
children = getContext().getCrawler().crawl(crawlTarget, true, getContext().getFileFilters()); // Get also the directories.
} else {
Expand Down Expand Up @@ -790,15 +763,15 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
// local_identifier and lid_reference or lidvid_reference tags.
url = child.getUrl();

if (url.toString().endsWith("." + getContext().getLabelExtension()) && TargetExaminer.isTargetALabel(url)) {
// Check this URL has been parsed before. If yes, skip this file.
if (ReferentialIntegrityUtil.urlsParsedCumulative.contains(url)) {
LOG.info("SKIPPING_URL_TRUE:referenceType,url {},{}",
ReferentialIntegrityUtil.getReferenceType(), url);
continue;
}

// Check this URL has been parsed before. If yes, skip this file.
if (ReferentialIntegrityUtil.urlsParsedCumulative.contains(url)) {
LOG.info("SKIPPING_URL_TRUE:referenceType,url {},{}",
ReferentialIntegrityUtil.getReferenceType(), url);
continue;
}
LOG.info("SKIPPING_URL_FALSE:referenceType,url {},{}",
if (url.toString().endsWith("." + getContext().getLabelExtension()) && TargetExaminer.isTargetALabel(url)) {
LOG.info("SKIPPING_URL_FALSE:referenceType,url {},{}",
ReferentialIntegrityUtil.getReferenceType(), url);
labelIsCollectionFlag = false;
labelIsBundleFlag = false;
Expand All @@ -813,18 +786,15 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
if (TargetExaminer.isTargetCollectionType (child.getUrl())) {
labelIsCollectionFlag = true;
}

xml = db.parse(url.openStream());
domSource = new DOMSource(xml);

// Note that the function getLidVidReferences() collects all references in the
// Reference_List group in Internal_Reference tags.
// so the lidOrLidVidReferencesCumulative will be a cumulative collection of all
// references collected in lidOrLidVidReferences for each label.

ArrayList<String> lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url);
ArrayList<String> logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url);

LOG.debug("additionalReferentialIntegrityChecks:url,lidOrLidVidReferences {},{}", url,
lidOrLidVidReferences.size());
LOG.debug("additionalReferentialIntegrityChecks:url,logicalIdentifiers {},{}", url,
Expand All @@ -848,6 +818,7 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
}

if ((lidOrLidVidReferences != null) && !lidOrLidVidReferences.isEmpty()) {
String urlstr = url.toString();
for (int ii = 0; ii < lidOrLidVidReferences.size(); ii++) {
LOG.debug(
"additionalReferentialIntegrityChecks:ii,url,lidOrLidVidReferences.get(ii) {},{},[{}]",
Expand All @@ -859,7 +830,7 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
// Note that because the reference id can be the same, the combination of the id
// plus the file name will make it unique.
if (!ReferentialIntegrityUtil
.hasReferenceIDAndFilenameComboAdded(lidOrLidVidReferences.get(ii), url)) {
.lidAndFilenameCombo.contains(lidOrLidVidReferences.get(ii) + urlstr)) {

ReferentialIntegrityUtil.lidOrLidVidReferencesCumulative
.add(lidOrLidVidReferences.get(ii));
Expand All @@ -874,7 +845,7 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
// be
// referred
// to.

ReferentialIntegrityUtil.lidAndFilenameCombo.add(lidOrLidVidReferences.get(ii) + urlstr);
LOG.debug("additionalReferentialIntegrityChecks:ADDING_REFERENCE {}",
lidOrLidVidReferences.get(ii), lidOrLidVidReferencesCumulative.size());
}
Expand Down Expand Up @@ -938,7 +909,7 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
ReferentialIntegrityUtil.referenceType, crawlTarget, bundleOrCollectionReferenceMap,
bundleOrCollectionReferenceMap.size());
}

/**
* Reports an error to the validation listener.
*
Expand All @@ -949,7 +920,7 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
*/
protected static void reportError(ProblemDefinition defn, URL targetUrl, int lineNumber,
int columnNumber) {
ValidationProblem problem = new ValidationProblem(defn, new ValidationTarget(targetUrl),
ValidationProblem problem = new ValidationProblem(defn, ValidationTarget.build(targetUrl),
lineNumber, columnNumber, defn.getMessage());
problemListener.addProblem(problem);
}
Expand All @@ -965,7 +936,7 @@ protected static void reportError(ProblemDefinition defn, URL targetUrl, int lin
*/
protected static void reportError(ProblemDefinition defn, URL target, int lineNumber,
int columnNumber, String message) {
ValidationProblem problem = new ValidationProblem(defn, new ValidationTarget(target),
ValidationProblem problem = new ValidationProblem(defn, ValidationTarget.build(target),
lineNumber, columnNumber, message);
problemListener.addProblem(problem);
}
Expand Down
50 changes: 2 additions & 48 deletions src/main/java/gov/nasa/pds/tools/util/Utility.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,16 @@
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import gov.nasa.pds.tools.validate.TargetExaminer;
import gov.nasa.pds.tools.validate.TargetType;
import gov.nasa.pds.tools.validate.ValidationTarget;

/**
* Utility class.
Expand All @@ -46,15 +43,6 @@
public class Utility {
private static final Logger LOG = LoggerFactory.getLogger(Utility.class);

// A static cache of the ValidationTargets.
// There is no need to re-evaluate and/or create these
// as validation proceeds, as they are static things like
// a file or a URL.
public static HashMap<String, ValidationTarget> cachedTargets;
static {
cachedTargets = new HashMap<>();
}

// Implementation is needed since pds.nasa.gov currently uses SNI
// which is not supported in Java 6, but is supported in Java 7.
static {
Expand All @@ -70,40 +58,6 @@ public boolean verify(String hostname, javax.net.ssl.SSLSession sslSession) {
});
}

/**
 * Looks up, or creates and caches, the ValidationTarget for a target URL.
 *
 * The cache is keyed on the string form of the URL. A null URL is never cached; a
 * fresh ValidationTarget wrapping null is returned on every such call.
 *
 * @param target the URL to obtain a ValidationTarget for, may be null
 * @return the cached ValidationTarget if one exists for this URL, otherwise a newly
 *         created one (which is cached unless the URL is null)
 */
public static ValidationTarget getValidationTarget(URL target) {
  if (target == null) {
    // Kept for backwards compatibility with previous code that passed null.
    // This seems to be null in the additional context products case.
    return new ValidationTarget(null);
  }
  final String cacheKey = target.toString();
  ValidationTarget cached = cachedTargets.get(cacheKey);
  if (cached != null) {
    return cached;
  }
  ValidationTarget created = new ValidationTarget(target);
  cachedTargets.put(cacheKey, created);
  return created;
}
/**
 * Looks up, or creates and caches, the ValidationTarget for a source URL and its label.
 *
 * The cache is keyed on the string form of the source URL only; the label is used
 * solely when a new ValidationTarget has to be constructed. A null source is never
 * cached; a fresh ValidationTarget wrapping null is returned on every such call.
 *
 * @param source the URL to obtain a ValidationTarget for, may be null
 * @param label the label URL handed to the ValidationTarget constructor on a cache miss
 * @return the cached ValidationTarget if one exists for the source, otherwise a newly
 *         created one (which is cached unless the source is null)
 */
public static ValidationTarget getValidationTarget(URL source, URL label) {
  if (source == null) {
    // Kept for backwards compatibility with previous code that passed null.
    // This seems to be null in the additional context products case.
    return new ValidationTarget(null);
  }
  final String cacheKey = source.toString();
  ValidationTarget cached = cachedTargets.get(cacheKey);
  if (cached != null) {
    return cached;
  }
  ValidationTarget created = new ValidationTarget(source, label);
  cachedTargets.put(cacheKey, created);
  return created;
}

/**
* Method that opens a connection. Supports redirects.
*
Expand All @@ -126,8 +80,8 @@ public static InputStream openConnection(URLConnection conn) throws IOException
SSLContext context = SSLContext.getInstance("TLSv1.2");
context.init(null, null, new java.security.SecureRandom());
HttpsURLConnection test = (HttpsURLConnection) conn;
SSLSocketFactory sf = test.getSSLSocketFactory();
SSLSocketFactory d = HttpsURLConnection.getDefaultSSLSocketFactory();
test.getSSLSocketFactory();
HttpsURLConnection.getDefaultSSLSocketFactory();
((HttpsURLConnection) conn).setSSLSocketFactory(context.getSocketFactory());
} catch (Exception e) {
throw new IOException(e.getMessage());
Expand Down
48 changes: 18 additions & 30 deletions src/main/java/gov/nasa/pds/tools/validate/Identifier.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,26 @@
public class Identifier {

/** The logical identifier. */
private String lid;
final private String lid;

/** The version. */
private String version;
final private String version;

/** Flag to indicate if a version exists. */
private boolean hasVersion;
final private boolean hasVersion;

final private String representation;

public Identifier(String id) {
this(id, null);
}

public Identifier(String lid, String version) {
if (lid == null) throw new IllegalArgumentException("cannot be an identifier if the lid is null");
this.hasVersion = version != null;
this.lid = lid;
this.representation = lid + "::" + (version == null ? "-1.-1" : version);
this.version = version;
if (this.version == null) {
hasVersion = false;
} else {
hasVersion = true;
}
}

public String getLid() {
Expand All @@ -66,36 +66,24 @@ public String toString() {
}

/**
* Determines whether 2 LIDVIDs are equal.
* Determines whether 2 LIDVIDs are near neighbors (equal in some cases).
*
*/
public boolean nearNeighbor(Identifier identifier) {
  // Different LIDs can never be neighbors, whatever the versions are.
  if (!this.lid.equals(identifier.lid)) {
    return false;
  }
  // A missing version on either side matches any version of the same LID.
  if (this.version == null || identifier.version == null) {
    return true;
  }
  // Both sides carry a version: they must agree exactly.
  return this.version.equals(identifier.version);
}

@Override
public boolean equals(Object o) {
boolean isEqual = false;
if (o instanceof Identifier) {
Identifier identifier = (Identifier) o;
if (this.lid.equals(identifier.getLid())) {
if (this.hasVersion) {
if (identifier.hasVersion() && this.version.equals(identifier.getVersion())) {
isEqual = true;
}
} else {
isEqual = true;
}
}
Identifier i = (Identifier)o;
return this.representation.equals(i.representation);
}
return isEqual;
return false;
}

@Override
public int hashCode() {
final int prime = 31;
int result = 17;
// result = prime * result
// + (hasVersion ? 0 : 1);
result = prime * result + ((lid == null) ? 0 : lid.hashCode());
// result = prime * result
// + ((version == null) ? 0 : version.hashCode());
return result;
return this.representation.hashCode();
}
}
Loading