diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000..2b6a315
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
+	<classpathentry kind="lib" path="external/lib/jsoup-1.7.3.jar"/>
+	<classpathentry kind="output" path="bin"/>
+</classpath>
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ae3c172
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/bin/
diff --git a/.project b/.project
new file mode 100644
index 0000000..a4afeb1
--- /dev/null
+++ b/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>WebCrawler</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..838bd9d
--- /dev/null
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,11 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.7
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.source=1.7
diff --git a/external/lib/jsoup-1.7.3.jar b/external/lib/jsoup-1.7.3.jar
new file mode 100644
index 0000000..aa5c798
Binary files /dev/null and b/external/lib/jsoup-1.7.3.jar differ
diff --git a/readme.txt b/readme.txt
new file mode 100644
index 0000000..676371b
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,55 @@
+To compile from the command line (assuming your PATH points to the JDK bin folder and your JAVA_HOME is also set),
+run from the src folder
+
+javac -cp ".;../external/lib/jsoup-1.7.3.jar" *.java
+
+(on Linux/macOS use ':' instead of ';' as the classpath separator). Then run from the same folder with
+
+java -cp ".;../external/lib/jsoup-1.7.3.jar" WebCrawlerMain
+
+Even better, import the project into Eclipse and run it from there.
+
+The crawling algorithm is breadth-first search when single-threaded.
+
+If the user enters more than one base URL to crawl, a thread is spawned for each. Threads can access any of the links
+found by any other thread.
+
+Adjustable params:
+
+In WebCrawlerMain.java
+
+private static final int NUM_SEARCH_RESULTS = 5;
+
+controls how many Google search results are used to seed the topical crawl
+(an equal number of threads will be spawned)
+
+In WebCrawlThread.java
+
+final int MAX_NUM_LINKS_TO_CRAWL = 1000;
+
+controls the maximum number of URLs to crawl
+
+and
+
+final int MAX_DEPTH = 1000;
+
+controls how many links to retrieve from each page
+
+The data structures
+
+private static HashSet<String> crawledUrls = new HashSet<String>();
+private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>();
+
+in WebCrawlThread.java are shared by all threads, and access to them goes through synchronized methods to prevent race
+conditions among the threads. In particular, the method getFromUrlsToCrawlSet retrieves the next URL from the LinkedHashSet
+urlsToCrawl (this set keeps insertion order), removes it, and adds it to the set crawledUrls.
+
+Since the whole method runs atomically (only one thread can be inside it at a time), no two threads crawl the same URL.
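+
+For illustration, the hand-off in SynchronizedManager.java looks like this (comments added here for clarity):
+
+synchronized String getFromUrlsToCrawlSet(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls)
+{
+    String url = urlsToCrawl.iterator().next();  // oldest URL that has not been crawled yet
+    urlsToCrawl.remove(url);                     // no other thread can pick it up
+    crawledUrls.add(url);                        // marked as crawled before the lock is released
+    return url;
+}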
+
+
+
+
+
+
+
+
diff --git a/src/ContentCrawlType.java b/src/ContentCrawlType.java
new file mode 100644
index 0000000..7d7c2f5
--- /dev/null
+++ b/src/ContentCrawlType.java
@@ -0,0 +1,6 @@
+public enum ContentCrawlType
+{
+ GENERAL_CRAWL,
+ FOCUSED_CRAWL_DOMAIN,
+ FOCUSED_CRAWL_TOPIC;
+}
diff --git a/src/SynchronizedManager.java b/src/SynchronizedManager.java
new file mode 100644
index 0000000..bfe6d29
--- /dev/null
+++ b/src/SynchronizedManager.java
@@ -0,0 +1,43 @@
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+
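+/**
+ * Wraps all access to the shared crawl sets (urlsToCrawl and crawledUrls) in synchronized methods.
+ * All crawler threads must lock on the same SynchronizedManager instance for this synchronization
+ * to be effective, so a single shared instance is used.
+ */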
+public class SynchronizedManager
+{
+ synchronized void addToUrlsToCrawlSet(String url, LinkedHashSet<String> urlsToCrawl)
+ {
+ urlsToCrawl.add(url);
+ }
+
+ synchronized void addAllToUrlsToCrawlSet(List<String> links, LinkedHashSet<String> urlsToCrawl)
+ {
+ urlsToCrawl.addAll(links);
+ }
+
+ synchronized String getFromUrlsToCrawlSet(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls)
+ {
+ String url = urlsToCrawl.iterator().next();
+ //remove the page from the list of urls to crawl
+ urlsToCrawl.remove(url);
+
+ // Add the page to the list of crawled URLs so that this page is not crawled again by a different thread
+ crawledUrls.add(url);
+
+ return url;
+ }
+
+ synchronized boolean keepCrawling(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls, int maxLinksToCrawl)
+ {
+ return !urlsToCrawl.isEmpty() && crawledUrls.size() <= maxLinksToCrawl;
+ }
+
+ synchronized boolean crawledUrlsSetContainsLink(String link, HashSet<String> crawledUrls)
+ {
+ return crawledUrls.contains(link);
+ }
+
+ synchronized int crawledUrlsSetSize(HashSet<String> crawledUrls)
+ {
+ return crawledUrls.size();
+ }
+}
diff --git a/src/UrlResolver.java b/src/UrlResolver.java
new file mode 100644
index 0000000..a2dc5a1
--- /dev/null
+++ b/src/UrlResolver.java
@@ -0,0 +1,459 @@
+/**
+ * This class is adapted from HtmlUnit with the following copyright:
+ *
+ * Copyright (c) 2002-2012 Gargoyle Software Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+public final class UrlResolver {
+ /**
+ * Resolves a given relative URL against a base URL. See
+ * RFC1808
+ * Section 4 for more details.
+ *
+ * @param baseUrl The base URL in which to resolve the specification.
+ * @param relativeUrl The relative URL to resolve against the base URL.
+ * @return the resolved specification.
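+ * Example (RFC 1808, Section 5.1): resolveUrl("http://a/b/c/d;p?q#f", "../g") returns "http://a/b/g".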
+ */
+ public static String resolveUrl(final String baseUrl, final String relativeUrl) {
+ if (baseUrl == null) {
+ throw new IllegalArgumentException("Base URL must not be null");
+ }
+ if (relativeUrl == null) {
+ throw new IllegalArgumentException("Relative URL must not be null");
+ }
+ final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());
+
+ return url.toString();
+ }
+
+ /**
+ * Returns the index within the specified string of the first occurrence of
+ * the specified search character.
+ *
+ * @param s the string to search
+ * @param searchChar the character to search for
+ * @param beginIndex the index at which to start the search
+ * @param endIndex the index at which to stop the search
+ * @return the index of the first occurrence of the character in the string or -1
+ */
+ private static int indexOf(final String s, final char searchChar, final int beginIndex, final int endIndex) {
+ for (int i = beginIndex; i < endIndex; i++) {
+ if (s.charAt(i) == searchChar) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Parses a given specification using the algorithm depicted in
+ * RFC1808:
+ *
+ * Section 2.4: Parsing a URL
+ *
+ * An accepted method for parsing URLs is useful to clarify the
+ * generic-RL syntax of Section 2.2 and to describe the algorithm for
+ * resolving relative URLs presented in Section 4. This section
+ * describes the parsing rules for breaking down a URL (relative or
+ * absolute) into the component parts described in Section 2.1. The
+ * rules assume that the URL has already been separated from any
+ * surrounding text and copied to a "parse string". The rules are
+ * listed in the order in which they would be applied by the parser.
+ *
+ * @param spec The specification to parse.
+ * @return the parsed specification.
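+ * Example: parseUrl("http://a/b;p?q#f") yields scheme "http", location "a", path "/b",
+ * parameters "p", query "q" and fragment "f".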
+ */
+ private static Url parseUrl(final String spec) {
+ final Url url = new Url();
+ int startIndex = 0;
+ int endIndex = spec.length();
+
+ // Section 2.4.1: Parsing the Fragment Identifier
+ //
+ // If the parse string contains a crosshatch "#" character, then the
+ // substring after the first (left-most) crosshatch "#" and up to the
+ // end of the parse string is the identifier. If the
+ // crosshatch is the last character, or no crosshatch is present, then
+ // the fragment identifier is empty. The matched substring, including
+ // the crosshatch character, is removed from the parse string before
+ // continuing.
+ //
+ // Note that the fragment identifier is not considered part of the URL.
+ // However, since it is often attached to the URL, parsers must be able
+ // to recognize and set aside fragment identifiers as part of the
+ // process.
+ final int crosshatchIndex = indexOf(spec, '#', startIndex, endIndex);
+
+ if (crosshatchIndex >= 0) {
+ url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
+ endIndex = crosshatchIndex;
+ }
+ // Section 2.4.2: Parsing the Scheme
+ //
+ // If the parse string contains a colon ":" after the first character
+ // and before any characters not allowed as part of a scheme name (i.e.,
+ // any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
+ // <scheme> of the URL is the substring of characters up to but not
+ // including the first colon. These characters and the colon are then
+ // removed from the parse string before continuing.
+ final int colonIndex = indexOf(spec, ':', startIndex, endIndex);
+
+ if (colonIndex > 0) {
+ final String scheme = spec.substring(startIndex, colonIndex);
+ if (isValidScheme(scheme)) {
+ url.scheme_ = scheme;
+ startIndex = colonIndex + 1;
+ }
+ }
+ // Section 2.4.3: Parsing the Network Location/Login
+ //
+ // If the parse string begins with a double-slash "//", then the
+ // substring of characters after the double-slash and up to, but not
+ // including, the next slash "/" character is the network location/login
+ // (<net_loc>) of the URL. If no trailing slash "/" is present, the
+ // entire remaining parse string is assigned to <net_loc>. The double-
+ // slash and <net_loc> are removed from the parse string before
+ // continuing.
+ //
+ // Note: We also accept a question mark "?" or a semicolon ";" character as
+ // delimiters for the network location/login (<net_loc>) of the URL.
+ final int locationStartIndex;
+ int locationEndIndex;
+
+ if (spec.startsWith("//", startIndex)) {
+ locationStartIndex = startIndex + 2;
+ locationEndIndex = indexOf(spec, '/', locationStartIndex, endIndex);
+ if (locationEndIndex >= 0) {
+ startIndex = locationEndIndex;
+ }
+ }
+ else {
+ locationStartIndex = -1;
+ locationEndIndex = -1;
+ }
+ // Section 2.4.4: Parsing the Query Information
+ //
+ // If the parse string contains a question mark "?" character, then the
+ // substring after the first (left-most) question mark "?" and up to the
+ // end of the parse string is the <query> information. If the question
+ // mark is the last character, or no question mark is present, then the
+ // query information is empty. The matched substring, including the
+ // question mark character, is removed from the parse string before
+ // continuing.
+ final int questionMarkIndex = indexOf(spec, '?', startIndex, endIndex);
+
+ if (questionMarkIndex >= 0) {
+ if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
+ // The substring of characters after the double-slash and up to, but not
+ // including, the question mark "?" character is the network location/login
+ // (<net_loc>) of the URL.
+ locationEndIndex = questionMarkIndex;
+ startIndex = questionMarkIndex;
+ }
+ url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
+ endIndex = questionMarkIndex;
+ }
+ // Section 2.4.5: Parsing the Parameters
+ //
+ // If the parse string contains a semicolon ";" character, then the
+ // substring after the first (left-most) semicolon ";" and up to the end
+ // of the parse string is the parameters (<params>). If the semicolon
+ // is the last character, or no semicolon is present, then <params> is
+ // empty. The matched substring, including the semicolon character, is
+ // removed from the parse string before continuing.
+ final int semicolonIndex = indexOf(spec, ';', startIndex, endIndex);
+
+ if (semicolonIndex >= 0) {
+ if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
+ // The substring of characters after the double-slash and up to, but not
+ // including, the semicolon ";" character is the network location/login
+ // (<net_loc>) of the URL.
+ locationEndIndex = semicolonIndex;
+ startIndex = semicolonIndex;
+ }
+ url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
+ endIndex = semicolonIndex;
+ }
+ // Section 2.4.6: Parsing the Path
+ //
+ // After the above steps, all that is left of the parse string is the
+ // URL <path> and the slash "/" that may precede it. Even though the
+ // initial slash is not part of the URL path, the parser must remember
+ // whether or not it was present so that later processes can
+ // differentiate between relative and absolute paths. Often this is
+ // done by simply storing the preceding slash along with the path.
+ if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
+ // The entire remaining parse string is assigned to the network
+ // location/login (<net_loc>) of the URL.
+ locationEndIndex = endIndex;
+ }
+ else if (startIndex < endIndex) {
+ url.path_ = spec.substring(startIndex, endIndex);
+ }
+ // Set the network location/login (<net_loc>) of the URL.
+ if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
+ url.location_ = spec.substring(locationStartIndex, locationEndIndex);
+ }
+ return url;
+ }
+
+ /*
+ * Returns true if specified string is a valid scheme name.
+ */
+ private static boolean isValidScheme(final String scheme) {
+ final int length = scheme.length();
+ if (length < 1) {
+ return false;
+ }
+ char c = scheme.charAt(0);
+ if (!Character.isLetter(c)) {
+ return false;
+ }
+ for (int i = 1; i < length; i++) {
+ c = scheme.charAt(i);
+ if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Resolves a given relative URL against a base URL using the algorithm
+ * depicted in RFC1808:
+ *
+ * Section 4: Resolving Relative URLs
+ *
+ * This section describes an example algorithm for resolving URLs within
+ * a context in which the URLs may be relative, such that the result is
+ * always a URL in absolute form. Although this algorithm cannot
+ * guarantee that the resulting URL will equal that intended by the
+ * original author, it does guarantee that any valid URL (relative or
+ * absolute) can be consistently transformed to an absolute form given a
+ * valid base URL.
+ *
+ * @param baseUrl The base URL in which to resolve the specification.
+ * @param relativeUrl The relative URL to resolve against the base URL.
+ * @return the resolved specification.
+ */
+ private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
+ final Url url = parseUrl(relativeUrl);
+ // Step 1: The base URL is established according to the rules of
+ // Section 3. If the base URL is the empty string (unknown),
+ // the embedded URL is interpreted as an absolute URL and
+ // we are done.
+ if (baseUrl == null) {
+ return url;
+ }
+ // Step 2: Both the base and embedded URLs are parsed into their
+ // component parts as described in Section 2.4.
+ // a) If the embedded URL is entirely empty, it inherits the
+ // entire base URL (i.e., is set equal to the base URL)
+ // and we are done.
+ if (relativeUrl.length() == 0) {
+ return new Url(baseUrl);
+ }
+ // b) If the embedded URL starts with a scheme name, it is
+ // interpreted as an absolute URL and we are done.
+ if (url.scheme_ != null) {
+ return url;
+ }
+ // c) Otherwise, the embedded URL inherits the scheme of
+ // the base URL.
+ url.scheme_ = baseUrl.scheme_;
+ // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
+ // Step 7. Otherwise, the embedded URL inherits the <net_loc>
+ // (if any) of the base URL.
+ if (url.location_ != null) {
+ return url;
+ }
+ url.location_ = baseUrl.location_;
+ // Step 4: If the embedded URL path is preceded by a slash "/", the
+ // path is not relative and we skip to Step 7.
+ if ((url.path_ != null) && ((url.path_.length() > 0) && ('/' == url.path_.charAt(0)))) {
+ url.path_ = removeLeadingSlashPoints(url.path_);
+ return url;
+ }
+ // Step 5: If the embedded URL path is empty (and not preceded by a
+ // slash), then the embedded URL inherits the base URL path,
+ // and
+ if (url.path_ == null) {
+ url.path_ = baseUrl.path_;
+ // a) if the embedded URL's <params> is non-empty, we skip to
+ // step 7; otherwise, it inherits the <params> of the base
+ // URL (if any) and
+ if (url.parameters_ != null) {
+ return url;
+ }
+ url.parameters_ = baseUrl.parameters_;
+ // b) if the embedded URL's <query> is non-empty, we skip to
+ // step 7; otherwise, it inherits the <query> of the base
+ // URL (if any) and we skip to step 7.
+ if (url.query_ != null) {
+ return url;
+ }
+ url.query_ = baseUrl.query_;
+ return url;
+ }
+ // Step 6: The last segment of the base URL's path (anything
+ // following the rightmost slash "/", or the entire path if no
+ // slash is present) is removed and the embedded URL's path is
+ // appended in its place. The following operations are
+ // then applied, in order, to the new path:
+ final String basePath = baseUrl.path_;
+ String path = "";
+
+ if (basePath != null) {
+ final int lastSlashIndex = basePath.lastIndexOf('/');
+
+ if (lastSlashIndex >= 0) {
+ path = basePath.substring(0, lastSlashIndex + 1);
+ }
+ }
+ else {
+ path = "/";
+ }
+ path = path.concat(url.path_);
+ // a) All occurrences of "./", where "." is a complete path
+ // segment, are removed.
+ int pathSegmentIndex;
+
+ while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
+ path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
+ }
+ // b) If the path ends with "." as a complete path segment,
+ // that "." is removed.
+ if (path.endsWith("/.")) {
+ path = path.substring(0, path.length() - 1);
+ }
+ // c) All occurrences of "<segment>/../", where <segment> is a
+ // complete path segment not equal to "..", are removed.
+ // Removal of these path segments is performed iteratively,
+ // removing the leftmost matching pattern on each iteration,
+ // until no matching pattern remains.
+ while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
+ final String pathSegment = path.substring(0, pathSegmentIndex);
+ final int slashIndex = pathSegment.lastIndexOf('/');
+
+ if (slashIndex < 0) {
+ continue;
+ }
+ if (!"..".equals(pathSegment.substring(slashIndex))) {
+ path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4));
+ }
+ }
+ // d) If the path ends with "<segment>/..", where <segment> is a
+ // complete path segment not equal to "..", that
+ // "<segment>/.." is removed.
+ if (path.endsWith("/..")) {
+ final String pathSegment = path.substring(0, path.length() - 3);
+ final int slashIndex = pathSegment.lastIndexOf('/');
+
+ if (slashIndex >= 0) {
+ path = path.substring(0, slashIndex + 1);
+ }
+ }
+
+ path = removeLeadingSlashPoints(path);
+
+ url.path_ = path;
+ // Step 7: The resulting URL components, including any inherited from
+ // the base URL, are recombined to give the absolute form of
+ // the embedded URL.
+ return url;
+ }
+
+ /**
+ * "/.." at the beginning should be removed as browsers do (not in RFC)
+ */
+ private static String removeLeadingSlashPoints(String path) {
+ while (path.startsWith("/..")) {
+ path = path.substring(3);
+ }
+
+ return path;
+ }
+
+ /**
+ * Class Url represents a Uniform Resource Locator.
+ *
+ * @author Martin Tamme
+ */
+ private static class Url {
+
+ String scheme_;
+ String location_;
+ String path_;
+ String parameters_;
+ String query_;
+ String fragment_;
+
+ /**
+ * Creates a Url object.
+ */
+ public Url() {
+ }
+
+ /**
+ * Creates a Url object from the specified
+ * Url object.
+ *
+ * @param url a Url object.
+ */
+ public Url(final Url url) {
+ scheme_ = url.scheme_;
+ location_ = url.location_;
+ path_ = url.path_;
+ parameters_ = url.parameters_;
+ query_ = url.query_;
+ fragment_ = url.fragment_;
+ }
+
+ /**
+ * Returns a string representation of the Url object.
+ *
+ * @return a string representation of the Url object.
+ */
+ @Override
+ public String toString() {
+ final StringBuilder sb = new StringBuilder();
+
+ if (scheme_ != null) {
+ sb.append(scheme_);
+ sb.append(':');
+ }
+ if (location_ != null) {
+ sb.append("//");
+ sb.append(location_);
+ }
+ if (path_ != null) {
+ sb.append(path_);
+ }
+ if (parameters_ != null) {
+ sb.append(';');
+ sb.append(parameters_);
+ }
+ if (query_ != null) {
+ sb.append('?');
+ sb.append(query_);
+ }
+ if (fragment_ != null) {
+ sb.append('#');
+ sb.append(fragment_);
+ }
+ return sb.toString();
+ }
+ }
+}
diff --git a/src/Utils.java b/src/Utils.java
new file mode 100644
index 0000000..69bc82a
--- /dev/null
+++ b/src/Utils.java
@@ -0,0 +1,250 @@
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+
+public class Utils
+{
+ public static URL checkUrl(String url)
+ {
+ if (!url.toLowerCase().startsWith("http"))
+ {
+ return null;
+ }
+
+ URL checkedUrl = null;
+ try
+ {
+ checkedUrl = new URL(url);
+ }
+ catch (MalformedURLException e)
+ {
+ return null;
+ }
+
+ return checkedUrl;
+ }
+
+ public static String removeTrailingSlash(String url)
+ {
+ if (url.endsWith("/"))
+ {
+ url = url.substring(0,url.length()-1);
+ }
+
+ return url;
+ }
+
+
+ public static String getCanonicalURL(String url)
+ {
+ return getCanonicalURL(url, null);
+ }
+
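+ /**
+ * Returns a canonical form of the given URL: resolves it against the optional context URL,
+ * lower-cases the scheme and host, drops the default port, normalizes the path and sorts the
+ * query parameters. For example, getCanonicalURL("HTTP://Example.com:80/a/../b/./c?b=2&a=1")
+ * should yield "http://example.com/b/c?a=1&b=2".
+ */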
+ public static String getCanonicalURL(String href, String context)
+ {
+ try
+ {
+ URL canonicalURL = new URL(UrlResolver.resolveUrl(context == null ? "" : context, href));
+
+ String host = canonicalURL.getHost().toLowerCase();
+ if (host.isEmpty())
+ {
+ // This is an invalid Url.
+ return null;
+ }
+
+ String path = canonicalURL.getPath();
+
+ /*
+ * Normalize: no empty segments (i.e., "//"), no segments equal to
+ * ".", and no segments equal to ".." that are preceded by a segment
+ * not equal to "..".
+ */
+ path = new URI(path.replace("\\", "/")).normalize().toString();
+
+ /*
+ * Convert '//' -> '/'
+ */
+ int idx = path.indexOf("//");
+ while (idx >= 0)
+ {
+ path = path.replace("//", "/");
+ idx = path.indexOf("//");
+ }
+
+ /*
+ * Drop starting '/../'
+ */
+ while (path.startsWith("/../"))
+ {
+ path = path.substring(3);
+ }
+
+ /*
+ * Trim
+ */
+ path = path.trim();
+
+ final SortedMap<String, String> params = createParameterMap(canonicalURL.getQuery());
+ final String queryString;
+
+ if (params != null && params.size() > 0)
+ {
+ String canonicalParams = canonicalize(params);
+ queryString = (canonicalParams.isEmpty() ? "" : "?" + canonicalParams);
+ }
+ else
+ {
+ queryString = "";
+ }
+
+ /*
+ * Add starting slash if needed
+ */
+ if (path.length() == 0)
+ {
+ path = "/" + path;
+ }
+
+ /*
+ * Drop default port: example.com:80 -> example.com
+ */
+ int port = canonicalURL.getPort();
+ if (port == canonicalURL.getDefaultPort())
+ {
+ port = -1;
+ }
+
+ String protocol = canonicalURL.getProtocol().toLowerCase();
+ String pathAndQueryString = normalizePath(path) + queryString;
+
+ URL result = new URL(protocol, host, port, pathAndQueryString);
+ return result.toExternalForm();
+
+ }
+ catch (MalformedURLException ex)
+ {
+ return null;
+ }
+ catch (URISyntaxException ex)
+ {
+ return null;
+ }
+ }
+
+ /**
+ * Takes a query string, separates the constituent name-value pairs, and
+ * stores them in a SortedMap ordered by lexicographical order.
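+ * For example, "b=2&a=1&c" yields the ordered map {a=1, b=2, c=""}.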
+ *
+ * @return Null if there is no query string.
+ */
+ private static SortedMap<String, String> createParameterMap(final String queryString)
+ {
+ if (queryString == null || queryString.isEmpty())
+ {
+ return null;
+ }
+
+ final String[] pairs = queryString.split("&");
+ final Map<String, String> params = new HashMap<>(pairs.length);
+
+ for (final String pair : pairs)
+ {
+ if (pair.length() == 0)
+ {
+ continue;
+ }
+
+ String[] tokens = pair.split("=", 2);
+ switch (tokens.length)
+ {
+ case 1:
+ if (pair.charAt(0) == '=')
+ {
+ params.put("", tokens[0]);
+ }
+ else
+ {
+ params.put(tokens[0], "");
+ }
+ break;
+ case 2:
+ params.put(tokens[0], tokens[1]);
+ break;
+ }
+ }
+ return new TreeMap<>(params);
+ }
+
+ /**
+ * Canonicalize the query string.
+ *
+ * @param sortedParamMap
+ * Parameter name-value pairs in lexicographical order.
+ * @return Canonical form of query string.
+ */
+ private static String canonicalize(final SortedMap<String, String> sortedParamMap)
+ {
+ if (sortedParamMap == null || sortedParamMap.isEmpty())
+ {
+ return "";
+ }
+
+ final StringBuffer sb = new StringBuffer(100);
+ for (Map.Entry<String, String> pair : sortedParamMap.entrySet())
+ {
+ final String key = pair.getKey().toLowerCase();
+ if (key.equals("jsessionid") || key.equals("phpsessid") || key.equals("aspsessionid"))
+ {
+ continue;
+ }
+ if (sb.length() > 0)
+ {
+ sb.append('&');
+ }
+ sb.append(percentEncodeRfc3986(pair.getKey()));
+ if (!pair.getValue().isEmpty())
+ {
+ sb.append('=');
+ sb.append(percentEncodeRfc3986(pair.getValue()));
+ }
+ }
+ return sb.toString();
+ }
+ /**
+ * Percent-encode values according the RFC 3986. The built-in Java
+ * URLEncoder does not encode according to the RFC, so we make the extra
+ * replacements.
+ *
+ * @param string
+ * Decoded string.
+ * @return Encoded string per RFC 3986.
+ */
+ private static String percentEncodeRfc3986(String string)
+ {
+ try
+ {
+ string = string.replace("+", "%2B");
+ string = URLDecoder.decode(string, "UTF-8");
+ string = URLEncoder.encode(string, "UTF-8");
+ return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~");
+ }
+ catch (Exception e)
+ {
+ return string;
+ }
+ }
+
+ private static String normalizePath(final String path)
+ {
+ return path.replace("%7E", "~").replace(" ", "%20");
+ }
+}
diff --git a/src/WebCrawlThread.java b/src/WebCrawlThread.java
new file mode 100644
index 0000000..b97f890
--- /dev/null
+++ b/src/WebCrawlThread.java
@@ -0,0 +1,366 @@
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+/**
+ * A crawler thread that crawls the web starting from a given start URL.
+ * @author zona
+ *
+ */
+public class WebCrawlThread extends Thread
+{
+ //absolute maximum number of urls to crawl
+ final int MAX_NUM_LINKS_TO_CRAWL = 1000;
+ final int MAX_DEPTH = 1000; //maximum number of links to collect from a single page
+ private ContentCrawlType crawlMode;
+ private String startUrl;
+ private int threadID;
+ private static SynchronizedManager synchronizedManager = new SynchronizedManager(); //single shared instance so all threads lock on the same monitor
+
+ //cache with disallowed urls as per the robots.txt file. The key is the host url
+ //i.e. www.cnn.com, and the values are all urls disallowed for www.cnn.com
+ private static Map<String, List<String>> disallowListCache = new HashMap<>();
+
+ // Crawl lists
+ private static HashSet<String> crawledUrls = new HashSet<String>();
+ private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>();
+
+ public WebCrawlThread(ContentCrawlType aCrawlMode,String aStartUrl, int aThreadID)
+ {
+ this.crawlMode = aCrawlMode;
+ this.startUrl = aStartUrl;
+ this.threadID = aThreadID;
+ }
+
+ public void init()
+ {
+ this.start();
+ }
+
+
+ // Different threads can access the disallow-list cache, so this method is synchronized to
+ // prevent race conditions (at the cost of making it a bottleneck).
+ /**
+ * Checks whether robots are allowed to access the given URL, based on the host's robots.txt
+ * (e.g. a line such as "Disallow: /private/").
+ * @param urlToCheck the URL to check against the host's disallow list
+ * @return true if crawling the URL is allowed, false otherwise
+ */
+ private synchronized boolean isRobotAllowed(URL urlToCheck)
+ {
+ String host = urlToCheck.getHost().toLowerCase();
+
+ // Retrieve host's disallow list from cache.
+ List<String> disallowList = disallowListCache.get(host);
+
+ // If list is not in the cache, download and cache it.
+ if (disallowList == null)
+ {
+ disallowList = new ArrayList<String>();
+ try
+ {
+ URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
+
+ // Open connection to robot file URL for reading.
+ BufferedReader reader = new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
+
+ // Read robot file, creating list of disallowed paths.
+ String line;
+ while ((line = reader.readLine()) != null)
+ {
+ if (line.toLowerCase().indexOf("disallow:") == 0)
+ {
+ String disallowPath = line.toLowerCase().substring("disallow:".length());
+ // Check disallow path for comments and remove if present.
+ int commentIndex = disallowPath.indexOf("#");
+ if (commentIndex != - 1)
+ {
+ disallowPath = disallowPath.substring(0, commentIndex);
+ }
+ // Remove leading or trailing spaces from disallow path.
+ disallowPath = disallowPath.trim();
+ // Add disallow path to list.
+ disallowList.add(disallowPath);
+ }
+ }
+ // Add new disallow list to cache.
+ disallowListCache.put(host, disallowList);
+ }
+ catch (Exception e)
+ {
+ //Assume robot is allowed since an exception
+ //is thrown if the robot file doesn't exist.
+ return true;
+ }
+ }
+
+ //Loop through disallow list to see if
+ //crawling is allowed for the given URL.
+ String file = urlToCheck.getFile().toLowerCase(); //disallow paths were lower-cased above, so compare in lower case
+ for (int i = 0; i < disallowList.size(); i++)
+ {
+ String disallow = (String) disallowList.get(i);
+ if (file.startsWith(disallow))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Fetches the HTML for the given URL. A HEAD request first checks that the page exists and is text/html.
+ * @param pageUrl the URL to download
+ * @return the page content as a single string, or null if the page could not be fetched or is not HTML
+ */
+ private String fetchPageContent(URL pageUrl)
+ {
+ try
+ {
+ // Open a connection to the URL and send a HEAD request
+ HttpURLConnection.setFollowRedirects(false);
+ HttpURLConnection con = (HttpURLConnection)pageUrl.openConnection();
+ con.setAllowUserInteraction(true);
+ con.setRequestMethod("HEAD");
+ con.setConnectTimeout(5000);
+ con.setReadTimeout(5000);
+ con.connect();
+
+ // Check if the page exists and if it is an HTML file
+ int code = con.getResponseCode();
+ String type = con.getContentType();
+
+ con.disconnect();
+
+ if (code != HttpURLConnection.HTTP_OK || type == null || !type.contains("text/html"))
+ {
+ return null;
+ }
+
+ // Open a connection to download the page content
+ InputStream pageStream = pageUrl.openStream();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(pageStream));
+
+ // Read the page line by line and write into the buffer
+ String line;
+ StringBuffer pageBuffer = new StringBuffer();
+
+ while ((line = reader.readLine()) != null)
+ {
+ pageBuffer.append(line);
+ }
+ pageStream.close();
+ reader.close();
+
+ // Return page content as a string
+ return pageBuffer.toString();
+
+ }
+
+ catch (Exception e)
+ {
+ return null;
+ }
+ }
+
+
+ private List<String> extractLinks(URL pageUrl, String pageContent, HashSet<String> crawledUrls)
+ {
+ // Create the regular expression for matching anchor tags and capturing the href value
+ // (starts with <a, then href = and the URL up to the closing quote or '>')
+ Pattern pattern = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]", Pattern.CASE_INSENSITIVE);
+ Matcher matcher = pattern.matcher(pageContent);
+
+ // Create the list of extracted links
+ List<String> linkList = new ArrayList<String>();
+ int numLinksAdded = 0;
+ while (matcher.find())
+ {
+ // Get the string inside the anchor href attribute
+ String link = Utils.getCanonicalURL(matcher.group(1).trim());
+
+ // Skip empty links
+ if (link == null || link.isEmpty())
+ {
+ continue;
+ }
+
+ // Skip links that are just page anchors
+ if (link.charAt(0) == '#')
+ {
+ continue;
+ }
+
+ // Skip mailto links
+ if (link.toLowerCase().contains("mailto:"))
+ {
+ continue;
+ }
+
+ // Skip JavaScript links
+ if (link.toLowerCase().contains("javascript:"))
+ {
+ continue;
+ }
+
+
+ // Construct absolute from relative URLs if necessary
+ if (!link.contains("://"))
+ {
+ if (link.charAt(0) == '/')
+ {
+ link = pageUrl.getProtocol() + "://" + pageUrl.getHost() + link;
+ }
+ else if (link.startsWith("../"))
+ {
+ try
+ {
+ URL absolute = new URL(pageUrl, link);
+ link = absolute.toString();
+ }
+ catch (MalformedURLException e)
+ {
+ link = "not valid";
+ }
+ }
+ else
+ {
+ String fileName = pageUrl.getFile();
+ String linkBase = pageUrl.getProtocol() + "://" + pageUrl.getHost();
+
+ if (!fileName.contains("/"))
+ {
+ link = linkBase + "/" + link;
+ }
+ else
+ {
+ String path = fileName.substring(0, fileName.lastIndexOf('/') + 1);
+ link = linkBase + path + link;
+ }
+ }
+ }
+
+ // If the link contains a named anchor, remove it
+ int index = link.indexOf('#');
+ if (index != -1)
+ {
+ link = link.substring(0, index);
+ }
+
+ //skip if it is the same as page url
+ if (Utils.removeTrailingSlash(link).toLowerCase().equals(pageUrl.toString().toLowerCase()))
+ {
+ continue;
+ }
+
+ // Verify the link and skip if invalid
+ URL checkedLink = Utils.checkUrl(link);
+ if (checkedLink == null)
+ {
+ continue;
+ }
+ //skip links outside domain if crawling in focused domain mode
+ if (crawlMode == ContentCrawlType.FOCUSED_CRAWL_DOMAIN && !link.toLowerCase().contains(startUrl.toLowerCase()))
+ {
+ //System.out.println("(Thread " + threadID + ") Not crawling " + link + " (out of domain " + startUrl + ")");
+ continue;
+ }
+ // Skip the link if it has already been crawled
+ if (synchronizedManager.crawledUrlsSetContainsLink(link, crawledUrls))
+ {
+ continue;
+ }
+
+ // Add the link to the link list try to limit depth
+ if (numLinksAdded< MAX_DEPTH)
+ {
+ linkList.add(link);
+ numLinksAdded++;
+ }
+ else
+ {
+ break;
+ }
+ }
+ // Return the list of links found on the page
+ return linkList;
+ }
+
+
+ public void crawl()
+ {
+ System.out.println("\n Thread " + threadID + " is starting crawling...\n");
+
+ long startTime = System.currentTimeMillis();
+
+ // Add the start URL to the list of URLs to crawl
+ synchronizedManager.addToUrlsToCrawlSet(startUrl, urlsToCrawl);
+
+ // Search until the number of found URLs reaches MAX_NUM_LINKS_TO_CRAWL or there are no more urls to crawl
+ while (synchronizedManager.keepCrawling(urlsToCrawl, crawledUrls, MAX_NUM_LINKS_TO_CRAWL) )
+ {
+ // Get the URL
+ String url = synchronizedManager.getFromUrlsToCrawlSet(urlsToCrawl, crawledUrls);
+
+ // Check and convert the URL string to the URL object
+ URL checkedUrl = Utils.checkUrl(url);
+
+ // Skip URL if robots are not allowed to access it.
+ if (checkedUrl != null && isRobotAllowed(checkedUrl))
+ {
+ // Download the page at the URL
+ String pageContent = fetchPageContent(checkedUrl);
+ if (pageContent != null && !pageContent.isEmpty())
+ {
+ // Extract valid links from the page
+ List<String> links = extractLinks(checkedUrl, pageContent, crawledUrls);
+
+ // Add the links to the list of URLs to crawl
+ if(!links.isEmpty())
+ {
+ synchronizedManager.addAllToUrlsToCrawlSet(links, urlsToCrawl);
+ }
+
+// // Add the page to the list of crawled URLs
+// crawledUrls.add(url);
+
+ // Display the crawled URL
+ System.out.println("(Thread " + threadID + ") " + url);
+ }
+ }
+ }
+ if(synchronizedManager.crawledUrlsSetSize(crawledUrls)>0)
+ {
+ long endTime = System.currentTimeMillis();
+ DateFormat formatter = new SimpleDateFormat("mm:ss");
+ String totalTime = formatter.format(endTime - startTime);
+
+ System.out.println("\n (Thread " + threadID + ") Done. " + synchronizedManager.crawledUrlsSetSize(crawledUrls) + " URLs found. Total time: " + totalTime);
+ }
+ else
+ System.out.println("No valid URL could be found.");
+ }
+
+ @Override
+ public void run()
+ {
+ crawl();
+ }
+}
diff --git a/src/WebCrawlerMain.java b/src/WebCrawlerMain.java
new file mode 100644
index 0000000..b6faebb
--- /dev/null
+++ b/src/WebCrawlerMain.java
@@ -0,0 +1,114 @@
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+
+public class WebCrawlerMain
+{
+ private static ContentCrawlType crawlMode;
+ private static final int NUM_SEARCH_RESULTS = 5; // how many Google search results to use in topical crawling
+
+ public static void main(String[] args) throws IOException
+ {
+ InputStreamReader converter = new InputStreamReader(System.in);
+ BufferedReader in = new BufferedReader(converter);
+ System.out.println("Enter the type of crawling: 1 (General Crawling), 2 (Focused domain Crawling) or 3 (Focused topic Crawling).");
+ crawlMode = ContentCrawlType.GENERAL_CRAWL;
+ while (true)
+ {
+ String mode = in.readLine();
+ if (!"1".equals(mode.trim()) && !"2".equals(mode.trim()) && !"3".equals(mode.trim()))
+ System.out.println("Invalid mode. Enter 1 (General), 2( Focused Domain) or 3 (Focused Topic)");
+ else
+ {
+ if ("2".equals(mode.trim()))
+ crawlMode = ContentCrawlType.FOCUSED_CRAWL_DOMAIN;
+ else if ("3".equals(mode.trim()))
+ crawlMode = ContentCrawlType.FOCUSED_CRAWL_TOPIC;
+ break;
+ }
+ }
+
+ List<WebCrawlThread> crawlThreads = new ArrayList<>();
+ if (crawlMode == ContentCrawlType.GENERAL_CRAWL || crawlMode == ContentCrawlType.FOCUSED_CRAWL_DOMAIN)
+ {
+ String startUrl = null;
+ System.out.println("Enter the URL where to start crawling from:");
+ int startUrlNum = 0;
+ while (true)
+ {
+ startUrl = in.readLine();
+ if (!"q".equals(startUrl.trim()) && Utils.checkUrl(startUrl) != null)
+ {
+ startUrlNum++;
+ // create a thread for crawling
+ crawlThreads.add(new WebCrawlThread(crawlMode, startUrl, startUrlNum));
+ System.out.println("Another URL? (q to quit)");
+ }
+ else if ("q".equals(startUrl.trim()))
+ {
+ break;
+ }
+ else
+ System.out.println("The given URL is not valid. Please enter a valid URL or q to quit.");
+ }
+
+ // Start crawling
+ for (WebCrawlThread crawlThread : crawlThreads)
+ {
+ crawlThread.init();
+ }
+ }
+ else if (crawlMode == ContentCrawlType.FOCUSED_CRAWL_TOPIC)//focused topic crawler
+ {
+ String search;
+ System.out.println("Enter your topic");
+
+ search = in.readLine();
+ //do google search on topic
+ String googleQuery = "http://www.google.com/search?q=";
+ String charset = "UTF-8";
+ //Chrome
+ String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36";
+ Elements links = Jsoup.connect(googleQuery + URLEncoder.encode(search, charset)).userAgent(userAgent).get().select("a[href]");
+ System.out.println("Crawling through top " + NUM_SEARCH_RESULTS + " Google search results for topic " + search);
+ int intCountResults = 0;
+ //crawl the first NUM_SEARCH_RESULTS links
+ for (Element link : links)
+ {
+ String title = link.text();
+ String startUrl = link.absUrl("href"); // Google returns URLs in the format "http://www.google.com/url?q=<url>&sa=U&ei=<key>".
+
+ //url = URLDecoder.decode(url.substring(url.indexOf('=') + 1, url.indexOf('&')), "UTF-8");
+
+ if (!startUrl.startsWith("http") || startUrl.contains("google.com"))
+ {
+ continue; // Ads/news/etc.
+ }
+
+ System.out.println("Title: " + title);
+ System.out.println("URL: " + startUrl);
+
+ // Start crawling
+ if (Utils.checkUrl(startUrl) != null && intCountResults < NUM_SEARCH_RESULTS)
+ {
+ intCountResults++;
+ // Start a thread for crawling
+ WebCrawlThread crawlThread = new WebCrawlThread(crawlMode, startUrl,intCountResults );
+ crawlThread.init();
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+ }
+}