diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000..2b6a315
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ae3c172
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/bin/
diff --git a/.project b/.project
new file mode 100644
index 0000000..a4afeb1
--- /dev/null
+++ b/.project
@@ -0,0 +1,17 @@
+
+
+    WebCrawler
+
+
+
+
+
+        org.eclipse.jdt.core.javabuilder
+
+
+
+
+
+        org.eclipse.jdt.core.javanature
+
+
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..838bd9d
--- /dev/null
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,11 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.7
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.source=1.7
diff --git a/external/lib/jsoup-1.7.3.jar b/external/lib/jsoup-1.7.3.jar
new file mode 100644
index 0000000..aa5c798
Binary files /dev/null and b/external/lib/jsoup-1.7.3.jar differ
diff --git a/readme.txt b/readme.txt
new file mode 100644
index 0000000..676371b
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,55 @@
+To compile from the command line, assuming that your path variable points to the JDK bin folder and your JAVA_HOME is
+also set, run from the src folder
+
+javac -cp ".;../external/lib/jsoup-1.7.3.jar" *.java
+
+Then run from the same folder (keeping the jsoup jar on the classpath) with
+
+java -cp ".;../external/lib/jsoup-1.7.3.jar" WebCrawlerMain
+
+Even better, import the project into Eclipse and run it from there.
+
+The crawling algorithm is breadth-first search when single-threaded.
+
+If the user enters more than one base url to crawl, a thread is spawned for each. Threads can access any of the links
+found by any other thread.
+
+Adjustable params:
+
+In WebCrawlerMain.java
+
+private static final int NUM_SEARCH_RESULTS = 5;
+
+controls how many Google search results are used to seed the topical crawling
+(an equal number of threads will be spawned).
+
+In WebCrawlThread.java
+
+final int MAX_NUM_LINKS_TO_CRAWL = 1000;
+
+controls the maximum number of urls to crawl,
+
+and
+
+final int MAX_DEPTH = 1000;
+
+controls how many links to retrieve from each page.
+
+The data structures
+
+private static HashSet<String> crawledUrls = new HashSet<String>();
+private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>();
+
+in WebCrawlThread.java are shared by all threads, and access to them goes through synchronized methods to prevent race conditions
+among the threads. In particular, the method getFromUrlsToCrawlSet retrieves the next url from the LinkedHashSet urlsToCrawl
+(this set keeps insertion order), removes it and adds it to the set crawledUrls.
+
+Since the whole method is executed by only one thread at a time, this guarantees that no two threads crawl the same url.
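+
+On Linux/macOS the classpath separator is ':' rather than ';', so (assuming the same folder layout) the equivalent
+commands are
+
+javac -cp ".:../external/lib/jsoup-1.7.3.jar" *.java
+java -cp ".:../external/lib/jsoup-1.7.3.jar" WebCrawlerMain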
+ + + + + + + + diff --git a/src/ContentCrawlType.java b/src/ContentCrawlType.java new file mode 100644 index 0000000..7d7c2f5 --- /dev/null +++ b/src/ContentCrawlType.java @@ -0,0 +1,6 @@ +public enum ContentCrawlType +{ + GENERAL_CRAWL, + FOCUSED_CRAWL_DOMAIN, + FOCUSED_CRAWL_TOPIC; +} diff --git a/src/SynchronizedManager.java b/src/SynchronizedManager.java new file mode 100644 index 0000000..bfe6d29 --- /dev/null +++ b/src/SynchronizedManager.java @@ -0,0 +1,43 @@ +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; + +public class SynchronizedManager +{ + synchronized void addToUrlsToCrawlSet(String url, LinkedHashSet urlsToCrawl) + { + urlsToCrawl.add(url); + } + + synchronized void addAllToUrlsToCrawlSet(List links, LinkedHashSet urlsToCrawl) + { + urlsToCrawl.addAll(links); + } + + synchronized String getFromUrlsToCrawlSet(LinkedHashSet urlsToCrawl, HashSet crawledUrls) + { + String url = urlsToCrawl.iterator().next(); + //remove the page from the list of urls to crawl + urlsToCrawl.remove(url); + + // Add the page to the list of crawled URLs so that this page is not crawled again by a different thread + crawledUrls.add(url); + + return url; + } + + synchronized boolean keepCrawling(LinkedHashSet urlsToCrawl, HashSet crawledUrls, int maxLinksToCrawl) + { + return !urlsToCrawl.isEmpty() && crawledUrls.size() <= maxLinksToCrawl; + } + + synchronized boolean crawledUrlsSetContainsLink(String link, HashSet crawledUrls) + { + return crawledUrls.contains(link); + } + + synchronized int crawledUrlsSetSize(HashSet crawledUrls) + { + return crawledUrls.size(); + } +} diff --git a/src/UrlResolver.java b/src/UrlResolver.java new file mode 100644 index 0000000..a2dc5a1 --- /dev/null +++ b/src/UrlResolver.java @@ -0,0 +1,459 @@ +/** + * This class is adopted from Htmlunit with the following copyright: + * + * Copyright (c) 2002-2012 Gargoyle Software Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +public final class UrlResolver { + /** + * Resolves a given relative URL against a base URL. See + * RFC1808 + * Section 4 for more details. + * + * @param baseUrl The base URL in which to resolve the specification. + * @param relativeUrl The relative URL to resolve against the base URL. + * @return the resolved specification. + */ + public static String resolveUrl(final String baseUrl, final String relativeUrl) { + if (baseUrl == null) { + throw new IllegalArgumentException("Base URL must not be null"); + } + if (relativeUrl == null) { + throw new IllegalArgumentException("Relative URL must not be null"); + } + final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim()); + + return url.toString(); + } + + /** + * Returns the index within the specified string of the first occurrence of + * the specified search character. 
+ * + * @param s the string to search + * @param searchChar the character to search for + * @param beginIndex the index at which to start the search + * @param endIndex the index at which to stop the search + * @return the index of the first occurrence of the character in the string or -1 + */ + private static int indexOf(final String s, final char searchChar, final int beginIndex, final int endIndex) { + for (int i = beginIndex; i < endIndex; i++) { + if (s.charAt(i) == searchChar) { + return i; + } + } + return -1; + } + + /** + * Parses a given specification using the algorithm depicted in + * RFC1808: + * + * Section 2.4: Parsing a URL + * + * An accepted method for parsing URLs is useful to clarify the + * generic-RL syntax of Section 2.2 and to describe the algorithm for + * resolving relative URLs presented in Section 4. This section + * describes the parsing rules for breaking down a URL (relative or + * absolute) into the component parts described in Section 2.1. The + * rules assume that the URL has already been separated from any + * surrounding text and copied to a "parse string". The rules are + * listed in the order in which they would be applied by the parser. + * + * @param spec The specification to parse. + * @return the parsed specification. + */ + private static Url parseUrl(final String spec) { + final Url url = new Url(); + int startIndex = 0; + int endIndex = spec.length(); + + // Section 2.4.1: Parsing the Fragment Identifier + // + // If the parse string contains a crosshatch "#" character, then the + // substring after the first (left-most) crosshatch "#" and up to the + // end of the parse string is the identifier. If the + // crosshatch is the last character, or no crosshatch is present, then + // the fragment identifier is empty. The matched substring, including + // the crosshatch character, is removed from the parse string before + // continuing. + // + // Note that the fragment identifier is not considered part of the URL. + // However, since it is often attached to the URL, parsers must be able + // to recognize and set aside fragment identifiers as part of the + // process. + final int crosshatchIndex = indexOf(spec, '#', startIndex, endIndex); + + if (crosshatchIndex >= 0) { + url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex); + endIndex = crosshatchIndex; + } + // Section 2.4.2: Parsing the Scheme + // + // If the parse string contains a colon ":" after the first character + // and before any characters not allowed as part of a scheme name (i.e., + // any not an alphanumeric, plus "+", period ".", or hyphen "-"), the + // of the URL is the substring of characters up to but not + // including the first colon. These characters and the colon are then + // removed from the parse string before continuing. + final int colonIndex = indexOf(spec, ':', startIndex, endIndex); + + if (colonIndex > 0) { + final String scheme = spec.substring(startIndex, colonIndex); + if (isValidScheme(scheme)) { + url.scheme_ = scheme; + startIndex = colonIndex + 1; + } + } + // Section 2.4.3: Parsing the Network Location/Login + // + // If the parse string begins with a double-slash "//", then the + // substring of characters after the double-slash and up to, but not + // including, the next slash "/" character is the network location/login + // () of the URL. If no trailing slash "/" is present, the + // entire remaining parse string is assigned to . The double- + // slash and are removed from the parse string before + // continuing. 
+ // + // Note: We also accept a question mark "?" or a semicolon ";" character as + // delimiters for the network location/login () of the URL. + final int locationStartIndex; + int locationEndIndex; + + if (spec.startsWith("//", startIndex)) { + locationStartIndex = startIndex + 2; + locationEndIndex = indexOf(spec, '/', locationStartIndex, endIndex); + if (locationEndIndex >= 0) { + startIndex = locationEndIndex; + } + } + else { + locationStartIndex = -1; + locationEndIndex = -1; + } + // Section 2.4.4: Parsing the Query Information + // + // If the parse string contains a question mark "?" character, then the + // substring after the first (left-most) question mark "?" and up to the + // end of the parse string is the information. If the question + // mark is the last character, or no question mark is present, then the + // query information is empty. The matched substring, including the + // question mark character, is removed from the parse string before + // continuing. + final int questionMarkIndex = indexOf(spec, '?', startIndex, endIndex); + + if (questionMarkIndex >= 0) { + if ((locationStartIndex >= 0) && (locationEndIndex < 0)) { + // The substring of characters after the double-slash and up to, but not + // including, the question mark "?" character is the network location/login + // () of the URL. + locationEndIndex = questionMarkIndex; + startIndex = questionMarkIndex; + } + url.query_ = spec.substring(questionMarkIndex + 1, endIndex); + endIndex = questionMarkIndex; + } + // Section 2.4.5: Parsing the Parameters + // + // If the parse string contains a semicolon ";" character, then the + // substring after the first (left-most) semicolon ";" and up to the end + // of the parse string is the parameters (). If the semicolon + // is the last character, or no semicolon is present, then is + // empty. The matched substring, including the semicolon character, is + // removed from the parse string before continuing. + final int semicolonIndex = indexOf(spec, ';', startIndex, endIndex); + + if (semicolonIndex >= 0) { + if ((locationStartIndex >= 0) && (locationEndIndex < 0)) { + // The substring of characters after the double-slash and up to, but not + // including, the semicolon ";" character is the network location/login + // () of the URL. + locationEndIndex = semicolonIndex; + startIndex = semicolonIndex; + } + url.parameters_ = spec.substring(semicolonIndex + 1, endIndex); + endIndex = semicolonIndex; + } + // Section 2.4.6: Parsing the Path + // + // After the above steps, all that is left of the parse string is the + // URL and the slash "/" that may precede it. Even though the + // initial slash is not part of the URL path, the parser must remember + // whether or not it was present so that later processes can + // differentiate between relative and absolute paths. Often this is + // done by simply storing the preceding slash along with the path. + if ((locationStartIndex >= 0) && (locationEndIndex < 0)) { + // The entire remaining parse string is assigned to the network + // location/login () of the URL. + locationEndIndex = endIndex; + } + else if (startIndex < endIndex) { + url.path_ = spec.substring(startIndex, endIndex); + } + // Set the network location/login () of the URL. + if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) { + url.location_ = spec.substring(locationStartIndex, locationEndIndex); + } + return url; + } + + /* + * Returns true if specified string is a valid scheme name. 
+ */ + private static boolean isValidScheme(final String scheme) { + final int length = scheme.length(); + if (length < 1) { + return false; + } + char c = scheme.charAt(0); + if (!Character.isLetter(c)) { + return false; + } + for (int i = 1; i < length; i++) { + c = scheme.charAt(i); + if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') { + return false; + } + } + return true; + } + + /** + * Resolves a given relative URL against a base URL using the algorithm + * depicted in RFC1808: + * + * Section 4: Resolving Relative URLs + * + * This section describes an example algorithm for resolving URLs within + * a context in which the URLs may be relative, such that the result is + * always a URL in absolute form. Although this algorithm cannot + * guarantee that the resulting URL will equal that intended by the + * original author, it does guarantee that any valid URL (relative or + * absolute) can be consistently transformed to an absolute form given a + * valid base URL. + * + * @param baseUrl The base URL in which to resolve the specification. + * @param relativeUrl The relative URL to resolve against the base URL. + * @return the resolved specification. + */ + private static Url resolveUrl(final Url baseUrl, final String relativeUrl) { + final Url url = parseUrl(relativeUrl); + // Step 1: The base URL is established according to the rules of + // Section 3. If the base URL is the empty string (unknown), + // the embedded URL is interpreted as an absolute URL and + // we are done. + if (baseUrl == null) { + return url; + } + // Step 2: Both the base and embedded URLs are parsed into their + // component parts as described in Section 2.4. + // a) If the embedded URL is entirely empty, it inherits the + // entire base URL (i.e., is set equal to the base URL) + // and we are done. + if (relativeUrl.length() == 0) { + return new Url(baseUrl); + } + // b) If the embedded URL starts with a scheme name, it is + // interpreted as an absolute URL and we are done. + if (url.scheme_ != null) { + return url; + } + // c) Otherwise, the embedded URL inherits the scheme of + // the base URL. + url.scheme_ = baseUrl.scheme_; + // Step 3: If the embedded URL's is non-empty, we skip to + // Step 7. Otherwise, the embedded URL inherits the + // (if any) of the base URL. + if (url.location_ != null) { + return url; + } + url.location_ = baseUrl.location_; + // Step 4: If the embedded URL path is preceded by a slash "/", the + // path is not relative and we skip to Step 7. + if ((url.path_ != null) && ((url.path_.length() > 0) && ('/' == url.path_.charAt(0)))) { + url.path_ = removeLeadingSlashPoints(url.path_); + return url; + } + // Step 5: If the embedded URL path is empty (and not preceded by a + // slash), then the embedded URL inherits the base URL path, + // and + if (url.path_ == null) { + url.path_ = baseUrl.path_; + // a) if the embedded URL's is non-empty, we skip to + // step 7; otherwise, it inherits the of the base + // URL (if any) and + if (url.parameters_ != null) { + return url; + } + url.parameters_ = baseUrl.parameters_; + // b) if the embedded URL's is non-empty, we skip to + // step 7; otherwise, it inherits the of the base + // URL (if any) and we skip to step 7. 
+ if (url.query_ != null) { + return url; + } + url.query_ = baseUrl.query_; + return url; + } + // Step 6: The last segment of the base URL's path (anything + // following the rightmost slash "/", or the entire path if no + // slash is present) is removed and the embedded URL's path is + // appended in its place. The following operations are + // then applied, in order, to the new path: + final String basePath = baseUrl.path_; + String path = ""; + + if (basePath != null) { + final int lastSlashIndex = basePath.lastIndexOf('/'); + + if (lastSlashIndex >= 0) { + path = basePath.substring(0, lastSlashIndex + 1); + } + } + else { + path = "/"; + } + path = path.concat(url.path_); + // a) All occurrences of "./", where "." is a complete path + // segment, are removed. + int pathSegmentIndex; + + while ((pathSegmentIndex = path.indexOf("/./")) >= 0) { + path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3)); + } + // b) If the path ends with "." as a complete path segment, + // that "." is removed. + if (path.endsWith("/.")) { + path = path.substring(0, path.length() - 1); + } + // c) All occurrences of "/../", where is a + // complete path segment not equal to "..", are removed. + // Removal of these path segments is performed iteratively, + // removing the leftmost matching pattern on each iteration, + // until no matching pattern remains. + while ((pathSegmentIndex = path.indexOf("/../")) > 0) { + final String pathSegment = path.substring(0, pathSegmentIndex); + final int slashIndex = pathSegment.lastIndexOf('/'); + + if (slashIndex < 0) { + continue; + } + if (!"..".equals(pathSegment.substring(slashIndex))) { + path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4)); + } + } + // d) If the path ends with "/..", where is a + // complete path segment not equal to "..", that + // "/.." is removed. + if (path.endsWith("/..")) { + final String pathSegment = path.substring(0, path.length() - 3); + final int slashIndex = pathSegment.lastIndexOf('/'); + + if (slashIndex >= 0) { + path = path.substring(0, slashIndex + 1); + } + } + + path = removeLeadingSlashPoints(path); + + url.path_ = path; + // Step 7: The resulting URL components, including any inherited from + // the base URL, are recombined to give the absolute form of + // the embedded URL. + return url; + } + + /** + * "/.." at the beginning should be removed as browsers do (not in RFC) + */ + private static String removeLeadingSlashPoints(String path) { + while (path.startsWith("/..")) { + path = path.substring(3); + } + + return path; + } + + /** + * Class Url represents a Uniform Resource Locator. + * + * @author Martin Tamme + */ + private static class Url { + + String scheme_; + String location_; + String path_; + String parameters_; + String query_; + String fragment_; + + /** + * Creates a Url object. + */ + public Url() { + } + + /** + * Creates a Url object from the specified + * Url object. + * + * @param url a Url object. + */ + public Url(final Url url) { + scheme_ = url.scheme_; + location_ = url.location_; + path_ = url.path_; + parameters_ = url.parameters_; + query_ = url.query_; + fragment_ = url.fragment_; + } + + /** + * Returns a string representation of the Url object. + * + * @return a string representation of the Url object. 
+ */ + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + + if (scheme_ != null) { + sb.append(scheme_); + sb.append(':'); + } + if (location_ != null) { + sb.append("//"); + sb.append(location_); + } + if (path_ != null) { + sb.append(path_); + } + if (parameters_ != null) { + sb.append(';'); + sb.append(parameters_); + } + if (query_ != null) { + sb.append('?'); + sb.append(query_); + } + if (fragment_ != null) { + sb.append('#'); + sb.append(fragment_); + } + return sb.toString(); + } + } +} diff --git a/src/Utils.java b/src/Utils.java new file mode 100644 index 0000000..69bc82a --- /dev/null +++ b/src/Utils.java @@ -0,0 +1,250 @@ +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.util.HashMap; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + + +public class Utils +{ + public static URL checkUrl(String url) + { + if (!url.toLowerCase().startsWith("http")) + { + return null; + } + + URL checkedUrl = null; + try + { + checkedUrl = new URL(url); + } + catch (MalformedURLException e) + { + return null; + } + + return checkedUrl; + } + + public static String removeTrailingSlash(String url) + { + if (url.endsWith("/")) + { + url = url.substring(0,url.length()-1); + } + + return url; + } + + + public static String getCanonicalURL(String url) + { + return getCanonicalURL(url, null); + } + + public static String getCanonicalURL(String href, String context) + { + try + { + URL canonicalURL = new URL(UrlResolver.resolveUrl(context == null ? "" : context, href)); + + String host = canonicalURL.getHost().toLowerCase(); + if (host == "") + { + // This is an invalid Url. + return null; + } + + String path = canonicalURL.getPath(); + + /* + * Normalize: no empty segments (i.e., "//"), no segments equal to + * ".", and no segments equal to ".." that are preceded by a segment + * not equal to "..". + */ + path = new URI(path.replace("\\", "/")).normalize().toString(); + + /* + * Convert '//' -> '/' + */ + int idx = path.indexOf("//"); + while (idx >= 0) + { + path = path.replace("//", "/"); + idx = path.indexOf("//"); + } + + /* + * Drop starting '/../' + */ + while (path.startsWith("/../")) + { + path = path.substring(3); + } + + /* + * Trim + */ + path = path.trim(); + + final SortedMap params = createParameterMap(canonicalURL.getQuery()); + final String queryString; + + if (params != null && params.size() > 0) + { + String canonicalParams = canonicalize(params); + queryString = (canonicalParams.isEmpty() ? "" : "?" + canonicalParams); + } + else + { + queryString = ""; + } + + /* + * Add starting slash if needed + */ + if (path.length() == 0) + { + path = "/" + path; + } + + /* + * Drop default port: example.com:80 -> example.com + */ + int port = canonicalURL.getPort(); + if (port == canonicalURL.getDefaultPort()) + { + port = -1; + } + + String protocol = canonicalURL.getProtocol().toLowerCase(); + String pathAndQueryString = normalizePath(path) + queryString; + + URL result = new URL(protocol, host, port, pathAndQueryString); + return result.toExternalForm(); + + } + catch (MalformedURLException ex) + { + return null; + } + catch (URISyntaxException ex) + { + return null; + } + } + + /** + * Takes a query string, separates the constituent name-value pairs, and + * stores them in a SortedMap ordered by lexicographical order. + * + * @return Null if there is no query string. 
+ */ + private static SortedMap createParameterMap(final String queryString) + { + if (queryString == null || queryString.isEmpty()) + { + return null; + } + + final String[] pairs = queryString.split("&"); + final Map params = new HashMap<>(pairs.length); + + for (final String pair : pairs) + { + if (pair.length() == 0) + { + continue; + } + + String[] tokens = pair.split("=", 2); + switch (tokens.length) + { + case 1: + if (pair.charAt(0) == '=') + { + params.put("", tokens[0]); + } + else + { + params.put(tokens[0], ""); + } + break; + case 2: + params.put(tokens[0], tokens[1]); + break; + } + } + return new TreeMap<>(params); + } + + /** + * Canonicalize the query string. + * + * @param sortedParamMap + * Parameter name-value pairs in lexicographical order. + * @return Canonical form of query string. + */ + private static String canonicalize(final SortedMap sortedParamMap) + { + if (sortedParamMap == null || sortedParamMap.isEmpty()) + { + return ""; + } + + final StringBuffer sb = new StringBuffer(100); + for (Map.Entry pair : sortedParamMap.entrySet()) + { + final String key = pair.getKey().toLowerCase(); + if (key.equals("jsessionid") || key.equals("phpsessid") || key.equals("aspsessionid")) + { + continue; + } + if (sb.length() > 0) + { + sb.append('&'); + } + sb.append(percentEncodeRfc3986(pair.getKey())); + if (!pair.getValue().isEmpty()) + { + sb.append('='); + sb.append(percentEncodeRfc3986(pair.getValue())); + } + } + return sb.toString(); + } + /** + * Percent-encode values according the RFC 3986. The built-in Java + * URLEncoder does not encode according to the RFC, so we make the extra + * replacements. + * + * @param string + * Decoded string. + * @return Encoded string per RFC 3986. + */ + private static String percentEncodeRfc3986(String string) + { + try + { + string = string.replace("+", "%2B"); + string = URLDecoder.decode(string, "UTF-8"); + string = URLEncoder.encode(string, "UTF-8"); + return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~"); + } + catch (Exception e) + { + return string; + } + } + + private static String normalizePath(final String path) + { + return path.replace("%7E", "~").replace(" ", "%20"); + } +} diff --git a/src/WebCrawlThread.java b/src/WebCrawlThread.java new file mode 100644 index 0000000..b97f890 --- /dev/null +++ b/src/WebCrawlThread.java @@ -0,0 +1,366 @@ +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * Thread crawls from a start url + * @author zona + * + */ +public class WebCrawlThread extends Thread +{ + //absolute maximum number of urls to crawl + final int MAX_NUM_LINKS_TO_CRAWL = 1000; + final int MAX_DEPTH = 1000; + private ContentCrawlType crawlMode; + private String startUrl; + private int threadID; + private SynchronizedManager synchronizedManager; + + //cache with disallowed urls as per the robots.txt file. The key is the host url + //i.e. 
www.cnn.com, and the values are all urls disallowed for www.cnn.com
+    private static Map<String, List<String>> disallowListCache = new HashMap<>();
+
+    // Crawl lists
+    private static HashSet<String> crawledUrls = new HashSet<String>();
+    private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>();
+
+    public WebCrawlThread(ContentCrawlType aCrawlMode, String aStartUrl, int aThreadID)
+    {
+        this.crawlMode = aCrawlMode;
+        this.startUrl = aStartUrl;
+        this.threadID = aThreadID;
+        this.synchronizedManager = new SynchronizedManager();
+    }
+
+    public void init()
+    {
+        this.start();
+    }
+
+
+    // Check if robot is allowed to access the given URL.
+    // Different threads can access the cache map, so make it synchronized to prevent race conditions,
+    // but this makes it a bottleneck.
+    /**
+     * Checks the host's robots.txt to see whether the crawler may access the given URL.
+     * @param urlToCheck the URL to check
+     * @return true if crawling the URL is allowed
+     */
+    private synchronized boolean isRobotAllowed(URL urlToCheck)
+    {
+        String host = urlToCheck.getHost().toLowerCase();
+
+        // Retrieve host's disallow list from cache.
+        List<String> disallowList = disallowListCache.get(host);
+
+        // If list is not in the cache, download and cache it.
+        if (disallowList == null)
+        {
+            disallowList = new ArrayList<String>();
+            try
+            {
+                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
+
+                // Open connection to robot file URL for reading.
+                BufferedReader reader = new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
+
+                // Read robot file, creating list of disallowed paths.
+                String line;
+                while ((line = reader.readLine()) != null)
+                {
+                    // The line is compared in lower case, so the directive must be matched in lower case too.
+                    if (line.toLowerCase().indexOf("disallow:") == 0)
+                    {
+                        String disallowPath = line.toLowerCase().substring("disallow:".length());
+                        // Check disallow path for comments and remove if present.
+                        int commentIndex = disallowPath.indexOf("#");
+                        if (commentIndex != -1)
+                        {
+                            disallowPath = disallowPath.substring(0, commentIndex);
+                        }
+                        // Remove leading or trailing spaces from disallow path.
+                        disallowPath = disallowPath.trim();
+                        // Add disallow path to list.
+                        disallowList.add(disallowPath);
+                    }
+                }
+                // Add new disallow list to cache.
+                disallowListCache.put(host, disallowList);
+            }
+            catch (Exception e)
+            {
+                // Assume robot is allowed since an exception
+                // is thrown if the robot file doesn't exist.
+                return true;
+            }
+        }
+
+        // Loop through disallow list to see if
+        // crawling is allowed for the given URL.
+        String file = urlToCheck.getFile();
+        for (int i = 0; i < disallowList.size(); i++)
+        {
+            String disallow = disallowList.get(i);
+            if (file.startsWith(disallow))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Fetches html for given url
+     * @param pageUrl the URL of the page to download
+     * @return the page content as a string, or null if it could not be fetched
+     */
+    private String fetchPageContent(URL pageUrl)
+    {
+        try
+        {
+            // Open a connection to the URL and send a HEAD request
+            HttpURLConnection.setFollowRedirects(false);
+            HttpURLConnection con = (HttpURLConnection) pageUrl.openConnection();
+            con.setAllowUserInteraction(true);
+            con.setRequestMethod("HEAD");
+            con.setDoOutput(true);
+            con.setConnectTimeout(5000);
+            con.setReadTimeout(5000);
+            con.connect();
+
+            // Check if the page exists and if it is an HTML file
+            int code = con.getResponseCode();
+            String type = con.getContentType();
+
+            con.disconnect();
+
+            if (code != HttpURLConnection.HTTP_OK || type == null || !type.contains("text/html"))
+            {
+                return null;
+            }
+
+            // Open a connection to download the page content
+            InputStream pageStream = pageUrl.openStream();
+            BufferedReader reader = new BufferedReader(new InputStreamReader(pageStream));
+
+            // Read the page line by line and write into the buffer
+            String line;
+            StringBuffer pageBuffer = new StringBuffer();
+
+            while ((line = reader.readLine()) != null)
+            {
+                pageBuffer.append(line);
+            }
+            pageStream.close();
+            reader.close();
+
+            // Return page content as a string
+            return pageBuffer.toString();
+
+        }
+
+        catch (Exception e)
+        {
+            return null;
+        }
+    }
+
+
+    private List<String> extractLinks(URL pageUrl, String pageContent, HashSet<String> crawledUrls)
+    {
+        // Create the regular expression for matching URLs
+        // Starts with <a href=
+        Pattern pattern = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]", Pattern.CASE_INSENSITIVE);
+        Matcher matcher = pattern.matcher(pageContent);
+
+        // Create the list of extracted links
+        List<String> linkList = new ArrayList<String>();
+        int numLinksAdded = 0;
+        while (matcher.find())
+        {
+            // Get the string inside the anchor href attribute
+            String link = Utils.getCanonicalURL(matcher.group(1).trim());
+
+            // Skip empty links
+            if (link == null || link.isEmpty())
+            {
+                continue;
+            }
+
+            // Skip links that are just page anchors
+            if (link.charAt(0) == '#')
+            {
+                continue;
+            }
+
+            // Skip mailto links
+            if (link.toLowerCase().contains("mailto:"))
+            {
+                continue;
+            }
+
+            // Skip JavaScript links
+            if (link.toLowerCase().contains("javascript:"))
+            {
+                continue;
+            }
+
+
+            // Construct absolute from relative URLs if necessary
+            if (!link.contains("://"))
+            {
+                if (link.charAt(0) == '/')
+                {
+                    link = pageUrl.getProtocol() + "://" + pageUrl.getHost() + link;
+                }
+                else if (link.startsWith("../"))
+                {
+                    try
+                    {
+                        URL absolute = new URL(pageUrl, link);
+                        link = absolute.toString();
+                    }
+                    catch (MalformedURLException e)
+                    {
+                        link = "not valid";
+                    }
+                }
+                else
+                {
+                    String fileName = pageUrl.getFile();
+                    String linkBase = pageUrl.getProtocol() + "://" + pageUrl.getHost();
+
+                    if (!fileName.contains("/"))
+                    {
+                        link = linkBase + "/" + link;
+                    }
+                    else
+                    {
+                        String path = fileName.substring(0, fileName.lastIndexOf('/') + 1);
+                        link = linkBase + path + link;
+                    }
+                }
+            }
+
+            // If the link contains a named anchor, remove it
+            int index = link.indexOf('#');
+            if (index != -1)
+            {
+                link = link.substring(0, index);
+            }
+
+            // skip if it is the same as page url
+            if (Utils.removeTrailingSlash(link).toLowerCase().equals(pageUrl.toString().toLowerCase()))
+            {
+                continue;
+            }
+
+            // Verify the link and skip if invalid
+            URL checkedLink = Utils.checkUrl(link);
+            if (checkedLink == null)
+            {
+                continue;
+            }
+            //skip links outside domain if crawling in
focused domain mode + if (crawlMode == ContentCrawlType.FOCUSED_CRAWL_DOMAIN && !link.toLowerCase().contains(startUrl)) + { + //System.out.println("(Thread " + threadID + ") Not crawling " + link + " (out of domain " + startUrl + ")"); + continue; + } + // Skip the link if it has already been crawled + if (synchronizedManager.crawledUrlsSetContainsLink(link, crawledUrls)) + { + continue; + } + + // Add the link to the link list try to limit depth + if (numLinksAdded< MAX_DEPTH) + { + linkList.add(link); + numLinksAdded++; + } + else + { + break; + } + } + // Return the list of links found on the page + return linkList; + } + + + public void crawl() + { + System.out.println("\n Thread " + threadID + " is starting crawling...\n"); + + long startTime = System.currentTimeMillis(); + + // Add the start URL to the list of URLs to crawl + synchronizedManager.addToUrlsToCrawlSet(startUrl, urlsToCrawl); + + // Search until the number of found URLs reaches MAX_NUM_LINKS_TO_CRAWL or there are no more urls to crawl + while (synchronizedManager.keepCrawling(urlsToCrawl, crawledUrls, MAX_NUM_LINKS_TO_CRAWL) ) + { + // Get the URL + String url = synchronizedManager.getFromUrlsToCrawlSet(urlsToCrawl, crawledUrls); + + // Check and convert the URL string to the URL object + URL checkedUrl = Utils.checkUrl(url); + + // Skip URL if robots are not allowed to access it. + if (checkedUrl != null && isRobotAllowed(checkedUrl)) + { + // Download the page at the URL + String pageContent = fetchPageContent(checkedUrl); + if (pageContent != null && !pageContent.isEmpty()) + { + // Extract valid links from the page + List links = extractLinks(checkedUrl, pageContent, crawledUrls); + + // Add the links to the list of URLs to crawl + if(!links.isEmpty()) + { + synchronizedManager.addAllToUrlsToCrawlSet(links, urlsToCrawl); + } + +// // Add the page to the list of crawled URLs +// crawledUrls.add(url); + + // Display the crawled URL + System.out.println("(Thread " + threadID + ") " + url); + } + } + } + if(synchronizedManager.crawledUrlsSetSize(crawledUrls)>0) + { + long endTime = System.currentTimeMillis(); + DateFormat formatter = new SimpleDateFormat("mm:ss"); + String totalTime = formatter.format(endTime - startTime); + + System.out.println("\n (Thread " + threadID + ") Done. " + synchronizedManager.crawledUrlsSetSize(crawledUrls) + " URLs found. 
Total time: " + totalTime); + } + else + System.out.println("No valid URL could be found."); + } + + @Override + public void run() + { + crawl(); + } +} diff --git a/src/WebCrawlerMain.java b/src/WebCrawlerMain.java new file mode 100644 index 0000000..b6faebb --- /dev/null +++ b/src/WebCrawlerMain.java @@ -0,0 +1,114 @@ +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + + +public class WebCrawlerMain +{ + private static ContentCrawlType crawlMode; + private static final int NUM_SEARCH_RESULTS = 5;//how many google search results to use in topical crawling + + public static void main(String[] args) throws IOException + { + InputStreamReader converter = new InputStreamReader(System.in); + BufferedReader in = new BufferedReader(converter); + System.out.println("Enter the type of crawling: 1 (General Crawling), 2 (Focused domain Crawling) or 3 (Focused topic Crawling)."); + crawlMode = ContentCrawlType.GENERAL_CRAWL; + while (true) + { + String mode = in.readLine(); + if (!"1".equals(mode.trim()) && !"2".equals(mode.trim()) && !"3".equals(mode.trim())) + System.out.println("Invalid mode. Enter 1 (General), 2( Focused Domain) or 3 (Focused Topic)"); + else + { + if ("2".equals(mode.trim())) + crawlMode = ContentCrawlType.FOCUSED_CRAWL_DOMAIN; + else if ("3".equals(mode.trim())) + crawlMode = ContentCrawlType.FOCUSED_CRAWL_TOPIC; + break; + } + } + + List crawlThreads = new ArrayList<>(); + if (crawlMode == ContentCrawlType.GENERAL_CRAWL || crawlMode == ContentCrawlType.FOCUSED_CRAWL_DOMAIN) + { + String startUrl = null; + System.out.println("Enter the URL where to start crawling from:"); + int startUrlNum = 0; + while (true) + { + startUrl = in.readLine(); + if (!"q".equals(startUrl.trim()) && Utils.checkUrl(startUrl) != null) + { + startUrlNum++; + // create a thread for crawling + crawlThreads.add(new WebCrawlThread(crawlMode, startUrl, startUrlNum)); + System.out.println("Another URL? (q to quit)"); + } + else if ("q".equals(startUrl.trim())) + { + break; + } + else + System.out.println("The given URL is not valid. Please enter a valid URL or q to quit."); + } + + // Start crawling + for (WebCrawlThread crawlThread : crawlThreads) + { + crawlThread.init(); + } + } + else if (crawlMode == ContentCrawlType.FOCUSED_CRAWL_TOPIC)//focused topic crawler + { + String search; + System.out.println("Enter your topic"); + + search = in.readLine(); + //do google search on topic + String googleQuery = "http://www.google.com/search?q="; + String charset = "UTF-8"; + //Chrome + String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36"; + Elements links = Jsoup.connect(googleQuery + URLEncoder.encode(search, charset)).userAgent(userAgent).get().select("a[href]"); + System.out.println("Crawling through top " + NUM_SEARCH_RESULTS + " Google search results for topic " + search); + int intCountResults = 0; + //crawl the first NUM_SEARCH_RESULTS links + for (Element link : links) + { + String title = link.text(); + String startUrl = link.absUrl("href"); // Google returns URLs in format "http://www.google.com/url?q=&sa=U&ei=". 
+ + //url = URLDecoder.decode(url.substring(url.indexOf('=') + 1, url.indexOf('&')), "UTF-8"); + + if (!startUrl.startsWith("http") || startUrl.contains("google.com")) + { + continue; // Ads/news/etc. + } + + System.out.println("Title: " + title); + System.out.println("URL: " + startUrl); + + // Start crawling + if (Utils.checkUrl(startUrl) != null && intCountResults < NUM_SEARCH_RESULTS) + { + intCountResults++; + // Start a thread for crawling + WebCrawlThread crawlThread = new WebCrawlThread(crawlMode, startUrl,intCountResults ); + crawlThread.init(); + } + else + { + break; + } + } + } + } +}
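For reference, a minimal sketch (not part of the patch) of how the URL handling added above can be exercised on its own. UrlResolver.resolveUrl and Utils.getCanonicalURL are the methods defined in the diff; the throwaway UrlResolverDemo class, the example URLs and the expected outputs in the comments are illustrative assumptions based on the RFC 1808 resolution rules the code follows.

public class UrlResolverDemo
{
    public static void main(String[] args)
    {
        // Relative references are resolved against a base URL as described in RFC 1808, Section 4.
        // "../g" drops the "d.html" segment and backs out of the "c/" directory of the base path.
        System.out.println(UrlResolver.resolveUrl("http://example.com/a/b/c/d.html", "../g"));
        // expected: http://example.com/a/b/g

        // A reference starting with "/" keeps the scheme and host but replaces the whole path.
        System.out.println(UrlResolver.resolveUrl("http://example.com/a/b/c/d.html", "/x/y"));
        // expected: http://example.com/x/y

        // getCanonicalURL additionally lower-cases the host, drops the default port and the fragment,
        // collapses "//" and "." path segments, and sorts the query parameters.
        System.out.println(Utils.getCanonicalURL("HTTP://Example.COM:80/a//b/./c?b=2&a=1#frag"));
        // expected: http://example.com/a/b/c?a=1&b=2
    }
}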