diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000..2b6a315
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ae3c172
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/bin/
diff --git a/.project b/.project
new file mode 100644
index 0000000..a4afeb1
--- /dev/null
+++ b/.project
@@ -0,0 +1,17 @@
+
+
+    WebCrawler
+
+
+
+
+
+        org.eclipse.jdt.core.javabuilder
+
+
+
+
+
+        org.eclipse.jdt.core.javanature
+
+
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..838bd9d
--- /dev/null
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,11 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.7
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.source=1.7
diff --git a/external/lib/jsoup-1.7.3.jar b/external/lib/jsoup-1.7.3.jar
new file mode 100644
index 0000000..aa5c798
Binary files /dev/null and b/external/lib/jsoup-1.7.3.jar differ
diff --git a/readme.txt b/readme.txt
new file mode 100644
index 0000000..676371b
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,55 @@
+To compile from the command line, assuming that your path variable points to the JDK bin folder and your JAVA_HOME is
+also set, run from the src folder
+
+javac -cp ".;../external/lib/jsoup-1.7.3.jar" *.java
+
+Then run from the same folder (keeping the jsoup jar on the classpath) with
+
+java -cp ".;../external/lib/jsoup-1.7.3.jar" WebCrawlerMain
+
+Even better, import the project into Eclipse and run it from there.
+
+The crawling algorithm is breadth-first search when single-threaded.
+
+If the user enters more than one base url to crawl, a thread is spawned for each. Threads can access any of the links
+found by any other thread.
+
+Adjustable params:
+
+In WebCrawlerMain.java
+
+private static final int NUM_SEARCH_RESULTS = 5;
+
+controls how many Google search results are used to seed the topical crawling
+(an equal number of threads will be spawned).
+
+In WebCrawlThread.java
+
+final int MAX_NUM_LINKS_TO_CRAWL = 1000;
+
+controls the maximum number of urls to crawl,
+
+and
+
+final int MAX_DEPTH = 1000;
+
+controls how many links to retrieve from each page.
+
+The data structures
+
+private static HashSet<String> crawledUrls = new HashSet<String>();
+private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>();
+
+in WebCrawlThread.java are shared by all threads, and access to them goes through synchronized methods to prevent race conditions
+among the threads. In particular, the method getFromUrlsToCrawlSet retrieves the next url from the LinkedHashSet urlsToCrawl
+(this set keeps insertion order), removes it and adds it to the set crawledUrls.
+
+Since the whole method is executed by only one thread at a time, this guarantees that no two threads crawl the same url.
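+
+On Linux/macOS the classpath separator is ':' rather than ';', so (assuming the same folder layout) the equivalent
+commands are
+
+javac -cp ".:../external/lib/jsoup-1.7.3.jar" *.java
+java -cp ".:../external/lib/jsoup-1.7.3.jar" WebCrawlerMain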
+ + + + + + + + diff --git a/src/ContentCrawlType.java b/src/ContentCrawlType.java new file mode 100644 index 0000000..7d7c2f5 --- /dev/null +++ b/src/ContentCrawlType.java @@ -0,0 +1,6 @@ +public enum ContentCrawlType +{ + GENERAL_CRAWL, + FOCUSED_CRAWL_DOMAIN, + FOCUSED_CRAWL_TOPIC; +} diff --git a/src/SynchronizedManager.java b/src/SynchronizedManager.java new file mode 100644 index 0000000..bfe6d29 --- /dev/null +++ b/src/SynchronizedManager.java @@ -0,0 +1,43 @@ +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; + +public class SynchronizedManager +{ + synchronized void addToUrlsToCrawlSet(String url, LinkedHashSet urlsToCrawl) + { + urlsToCrawl.add(url); + } + + synchronized void addAllToUrlsToCrawlSet(List links, LinkedHashSet urlsToCrawl) + { + urlsToCrawl.addAll(links); + } + + synchronized String getFromUrlsToCrawlSet(LinkedHashSet urlsToCrawl, HashSet crawledUrls) + { + String url = urlsToCrawl.iterator().next(); + //remove the page from the list of urls to crawl + urlsToCrawl.remove(url); + + // Add the page to the list of crawled URLs so that this page is not crawled again by a different thread + crawledUrls.add(url); + + return url; + } + + synchronized boolean keepCrawling(LinkedHashSet urlsToCrawl, HashSet crawledUrls, int maxLinksToCrawl) + { + return !urlsToCrawl.isEmpty() && crawledUrls.size() <= maxLinksToCrawl; + } + + synchronized boolean crawledUrlsSetContainsLink(String link, HashSet crawledUrls) + { + return crawledUrls.contains(link); + } + + synchronized int crawledUrlsSetSize(HashSet crawledUrls) + { + return crawledUrls.size(); + } +} diff --git a/src/UrlResolver.java b/src/UrlResolver.java new file mode 100644 index 0000000..a2dc5a1 --- /dev/null +++ b/src/UrlResolver.java @@ -0,0 +1,459 @@ +/** + * This class is adopted from Htmlunit with the following copyright: + * + * Copyright (c) 2002-2012 Gargoyle Software Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +public final class UrlResolver { + /** + * Resolves a given relative URL against a base URL. See + * RFC1808 + * Section 4 for more details. + * + * @param baseUrl The base URL in which to resolve the specification. + * @param relativeUrl The relative URL to resolve against the base URL. + * @return the resolved specification. + */ + public static String resolveUrl(final String baseUrl, final String relativeUrl) { + if (baseUrl == null) { + throw new IllegalArgumentException("Base URL must not be null"); + } + if (relativeUrl == null) { + throw new IllegalArgumentException("Relative URL must not be null"); + } + final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim()); + + return url.toString(); + } + + /** + * Returns the index within the specified string of the first occurrence of + * the specified search character. 
+ * + * @param s the string to search + * @param searchChar the character to search for + * @param beginIndex the index at which to start the search + * @param endIndex the index at which to stop the search + * @return the index of the first occurrence of the character in the string or -1 + */ + private static int indexOf(final String s, final char searchChar, final int beginIndex, final int endIndex) { + for (int i = beginIndex; i < endIndex; i++) { + if (s.charAt(i) == searchChar) { + return i; + } + } + return -1; + } + + /** + * Parses a given specification using the algorithm depicted in + * RFC1808: + * + * Section 2.4: Parsing a URL + * + * An accepted method for parsing URLs is useful to clarify the + * generic-RL syntax of Section 2.2 and to describe the algorithm for + * resolving relative URLs presented in Section 4. This section + * describes the parsing rules for breaking down a URL (relative or + * absolute) into the component parts described in Section 2.1. The + * rules assume that the URL has already been separated from any + * surrounding text and copied to a "parse string". The rules are + * listed in the order in which they would be applied by the parser. + * + * @param spec The specification to parse. + * @return the parsed specification. + */ + private static Url parseUrl(final String spec) { + final Url url = new Url(); + int startIndex = 0; + int endIndex = spec.length(); + + // Section 2.4.1: Parsing the Fragment Identifier + // + // If the parse string contains a crosshatch "#" character, then the + // substring after the first (left-most) crosshatch "#" and up to the + // end of the parse string is the identifier. If the + // crosshatch is the last character, or no crosshatch is present, then + // the fragment identifier is empty. The matched substring, including + // the crosshatch character, is removed from the parse string before + // continuing. + // + // Note that the fragment identifier is not considered part of the URL. + // However, since it is often attached to the URL, parsers must be able + // to recognize and set aside fragment identifiers as part of the + // process. + final int crosshatchIndex = indexOf(spec, '#', startIndex, endIndex); + + if (crosshatchIndex >= 0) { + url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex); + endIndex = crosshatchIndex; + } + // Section 2.4.2: Parsing the Scheme + // + // If the parse string contains a colon ":" after the first character + // and before any characters not allowed as part of a scheme name (i.e., + // any not an alphanumeric, plus "+", period ".", or hyphen "-"), the + // of the URL is the substring of characters up to but not + // including the first colon. These characters and the colon are then + // removed from the parse string before continuing. + final int colonIndex = indexOf(spec, ':', startIndex, endIndex); + + if (colonIndex > 0) { + final String scheme = spec.substring(startIndex, colonIndex); + if (isValidScheme(scheme)) { + url.scheme_ = scheme; + startIndex = colonIndex + 1; + } + } + // Section 2.4.3: Parsing the Network Location/Login + // + // If the parse string begins with a double-slash "//", then the + // substring of characters after the double-slash and up to, but not + // including, the next slash "/" character is the network location/login + // () of the URL. If no trailing slash "/" is present, the + // entire remaining parse string is assigned to . The double- + // slash and are removed from the parse string before + // continuing. 
+ // + // Note: We also accept a question mark "?" or a semicolon ";" character as + // delimiters for the network location/login () of the URL. + final int locationStartIndex; + int locationEndIndex; + + if (spec.startsWith("//", startIndex)) { + locationStartIndex = startIndex + 2; + locationEndIndex = indexOf(spec, '/', locationStartIndex, endIndex); + if (locationEndIndex >= 0) { + startIndex = locationEndIndex; + } + } + else { + locationStartIndex = -1; + locationEndIndex = -1; + } + // Section 2.4.4: Parsing the Query Information + // + // If the parse string contains a question mark "?" character, then the + // substring after the first (left-most) question mark "?" and up to the + // end of the parse string is the information. If the question + // mark is the last character, or no question mark is present, then the + // query information is empty. The matched substring, including the + // question mark character, is removed from the parse string before + // continuing. + final int questionMarkIndex = indexOf(spec, '?', startIndex, endIndex); + + if (questionMarkIndex >= 0) { + if ((locationStartIndex >= 0) && (locationEndIndex < 0)) { + // The substring of characters after the double-slash and up to, but not + // including, the question mark "?" character is the network location/login + // () of the URL. + locationEndIndex = questionMarkIndex; + startIndex = questionMarkIndex; + } + url.query_ = spec.substring(questionMarkIndex + 1, endIndex); + endIndex = questionMarkIndex; + } + // Section 2.4.5: Parsing the Parameters + // + // If the parse string contains a semicolon ";" character, then the + // substring after the first (left-most) semicolon ";" and up to the end + // of the parse string is the parameters (). If the semicolon + // is the last character, or no semicolon is present, then is + // empty. The matched substring, including the semicolon character, is + // removed from the parse string before continuing. + final int semicolonIndex = indexOf(spec, ';', startIndex, endIndex); + + if (semicolonIndex >= 0) { + if ((locationStartIndex >= 0) && (locationEndIndex < 0)) { + // The substring of characters after the double-slash and up to, but not + // including, the semicolon ";" character is the network location/login + // () of the URL. + locationEndIndex = semicolonIndex; + startIndex = semicolonIndex; + } + url.parameters_ = spec.substring(semicolonIndex + 1, endIndex); + endIndex = semicolonIndex; + } + // Section 2.4.6: Parsing the Path + // + // After the above steps, all that is left of the parse string is the + // URL and the slash "/" that may precede it. Even though the + // initial slash is not part of the URL path, the parser must remember + // whether or not it was present so that later processes can + // differentiate between relative and absolute paths. Often this is + // done by simply storing the preceding slash along with the path. + if ((locationStartIndex >= 0) && (locationEndIndex < 0)) { + // The entire remaining parse string is assigned to the network + // location/login () of the URL. + locationEndIndex = endIndex; + } + else if (startIndex < endIndex) { + url.path_ = spec.substring(startIndex, endIndex); + } + // Set the network location/login () of the URL. + if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) { + url.location_ = spec.substring(locationStartIndex, locationEndIndex); + } + return url; + } + + /* + * Returns true if specified string is a valid scheme name. 
+ */ + private static boolean isValidScheme(final String scheme) { + final int length = scheme.length(); + if (length < 1) { + return false; + } + char c = scheme.charAt(0); + if (!Character.isLetter(c)) { + return false; + } + for (int i = 1; i < length; i++) { + c = scheme.charAt(i); + if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') { + return false; + } + } + return true; + } + + /** + * Resolves a given relative URL against a base URL using the algorithm + * depicted in RFC1808: + * + * Section 4: Resolving Relative URLs + * + * This section describes an example algorithm for resolving URLs within + * a context in which the URLs may be relative, such that the result is + * always a URL in absolute form. Although this algorithm cannot + * guarantee that the resulting URL will equal that intended by the + * original author, it does guarantee that any valid URL (relative or + * absolute) can be consistently transformed to an absolute form given a + * valid base URL. + * + * @param baseUrl The base URL in which to resolve the specification. + * @param relativeUrl The relative URL to resolve against the base URL. + * @return the resolved specification. + */ + private static Url resolveUrl(final Url baseUrl, final String relativeUrl) { + final Url url = parseUrl(relativeUrl); + // Step 1: The base URL is established according to the rules of + // Section 3. If the base URL is the empty string (unknown), + // the embedded URL is interpreted as an absolute URL and + // we are done. + if (baseUrl == null) { + return url; + } + // Step 2: Both the base and embedded URLs are parsed into their + // component parts as described in Section 2.4. + // a) If the embedded URL is entirely empty, it inherits the + // entire base URL (i.e., is set equal to the base URL) + // and we are done. + if (relativeUrl.length() == 0) { + return new Url(baseUrl); + } + // b) If the embedded URL starts with a scheme name, it is + // interpreted as an absolute URL and we are done. + if (url.scheme_ != null) { + return url; + } + // c) Otherwise, the embedded URL inherits the scheme of + // the base URL. + url.scheme_ = baseUrl.scheme_; + // Step 3: If the embedded URL's is non-empty, we skip to + // Step 7. Otherwise, the embedded URL inherits the + // (if any) of the base URL. + if (url.location_ != null) { + return url; + } + url.location_ = baseUrl.location_; + // Step 4: If the embedded URL path is preceded by a slash "/", the + // path is not relative and we skip to Step 7. + if ((url.path_ != null) && ((url.path_.length() > 0) && ('/' == url.path_.charAt(0)))) { + url.path_ = removeLeadingSlashPoints(url.path_); + return url; + } + // Step 5: If the embedded URL path is empty (and not preceded by a + // slash), then the embedded URL inherits the base URL path, + // and + if (url.path_ == null) { + url.path_ = baseUrl.path_; + // a) if the embedded URL's is non-empty, we skip to + // step 7; otherwise, it inherits the of the base + // URL (if any) and + if (url.parameters_ != null) { + return url; + } + url.parameters_ = baseUrl.parameters_; + // b) if the embedded URL's is non-empty, we skip to + // step 7; otherwise, it inherits the of the base + // URL (if any) and we skip to step 7. 
+ if (url.query_ != null) { + return url; + } + url.query_ = baseUrl.query_; + return url; + } + // Step 6: The last segment of the base URL's path (anything + // following the rightmost slash "/", or the entire path if no + // slash is present) is removed and the embedded URL's path is + // appended in its place. The following operations are + // then applied, in order, to the new path: + final String basePath = baseUrl.path_; + String path = ""; + + if (basePath != null) { + final int lastSlashIndex = basePath.lastIndexOf('/'); + + if (lastSlashIndex >= 0) { + path = basePath.substring(0, lastSlashIndex + 1); + } + } + else { + path = "/"; + } + path = path.concat(url.path_); + // a) All occurrences of "./", where "." is a complete path + // segment, are removed. + int pathSegmentIndex; + + while ((pathSegmentIndex = path.indexOf("/./")) >= 0) { + path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3)); + } + // b) If the path ends with "." as a complete path segment, + // that "." is removed. + if (path.endsWith("/.")) { + path = path.substring(0, path.length() - 1); + } + // c) All occurrences of "/../", where is a + // complete path segment not equal to "..", are removed. + // Removal of these path segments is performed iteratively, + // removing the leftmost matching pattern on each iteration, + // until no matching pattern remains. + while ((pathSegmentIndex = path.indexOf("/../")) > 0) { + final String pathSegment = path.substring(0, pathSegmentIndex); + final int slashIndex = pathSegment.lastIndexOf('/'); + + if (slashIndex < 0) { + continue; + } + if (!"..".equals(pathSegment.substring(slashIndex))) { + path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4)); + } + } + // d) If the path ends with "/..", where is a + // complete path segment not equal to "..", that + // "/.." is removed. + if (path.endsWith("/..")) { + final String pathSegment = path.substring(0, path.length() - 3); + final int slashIndex = pathSegment.lastIndexOf('/'); + + if (slashIndex >= 0) { + path = path.substring(0, slashIndex + 1); + } + } + + path = removeLeadingSlashPoints(path); + + url.path_ = path; + // Step 7: The resulting URL components, including any inherited from + // the base URL, are recombined to give the absolute form of + // the embedded URL. + return url; + } + + /** + * "/.." at the beginning should be removed as browsers do (not in RFC) + */ + private static String removeLeadingSlashPoints(String path) { + while (path.startsWith("/..")) { + path = path.substring(3); + } + + return path; + } + + /** + * Class Url represents a Uniform Resource Locator. + * + * @author Martin Tamme + */ + private static class Url { + + String scheme_; + String location_; + String path_; + String parameters_; + String query_; + String fragment_; + + /** + * Creates a Url object. + */ + public Url() { + } + + /** + * Creates a Url object from the specified + * Url object. + * + * @param url a Url object. + */ + public Url(final Url url) { + scheme_ = url.scheme_; + location_ = url.location_; + path_ = url.path_; + parameters_ = url.parameters_; + query_ = url.query_; + fragment_ = url.fragment_; + } + + /** + * Returns a string representation of the Url object. + * + * @return a string representation of the Url object. 
+ */ + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + + if (scheme_ != null) { + sb.append(scheme_); + sb.append(':'); + } + if (location_ != null) { + sb.append("//"); + sb.append(location_); + } + if (path_ != null) { + sb.append(path_); + } + if (parameters_ != null) { + sb.append(';'); + sb.append(parameters_); + } + if (query_ != null) { + sb.append('?'); + sb.append(query_); + } + if (fragment_ != null) { + sb.append('#'); + sb.append(fragment_); + } + return sb.toString(); + } + } +} diff --git a/src/Utils.java b/src/Utils.java new file mode 100644 index 0000000..69bc82a --- /dev/null +++ b/src/Utils.java @@ -0,0 +1,250 @@ +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.util.HashMap; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + + +public class Utils +{ + public static URL checkUrl(String url) + { + if (!url.toLowerCase().startsWith("http")) + { + return null; + } + + URL checkedUrl = null; + try + { + checkedUrl = new URL(url); + } + catch (MalformedURLException e) + { + return null; + } + + return checkedUrl; + } + + public static String removeTrailingSlash(String url) + { + if (url.endsWith("/")) + { + url = url.substring(0,url.length()-1); + } + + return url; + } + + + public static String getCanonicalURL(String url) + { + return getCanonicalURL(url, null); + } + + public static String getCanonicalURL(String href, String context) + { + try + { + URL canonicalURL = new URL(UrlResolver.resolveUrl(context == null ? "" : context, href)); + + String host = canonicalURL.getHost().toLowerCase(); + if (host == "") + { + // This is an invalid Url. + return null; + } + + String path = canonicalURL.getPath(); + + /* + * Normalize: no empty segments (i.e., "//"), no segments equal to + * ".", and no segments equal to ".." that are preceded by a segment + * not equal to "..". + */ + path = new URI(path.replace("\\", "/")).normalize().toString(); + + /* + * Convert '//' -> '/' + */ + int idx = path.indexOf("//"); + while (idx >= 0) + { + path = path.replace("//", "/"); + idx = path.indexOf("//"); + } + + /* + * Drop starting '/../' + */ + while (path.startsWith("/../")) + { + path = path.substring(3); + } + + /* + * Trim + */ + path = path.trim(); + + final SortedMap params = createParameterMap(canonicalURL.getQuery()); + final String queryString; + + if (params != null && params.size() > 0) + { + String canonicalParams = canonicalize(params); + queryString = (canonicalParams.isEmpty() ? "" : "?" + canonicalParams); + } + else + { + queryString = ""; + } + + /* + * Add starting slash if needed + */ + if (path.length() == 0) + { + path = "/" + path; + } + + /* + * Drop default port: example.com:80 -> example.com + */ + int port = canonicalURL.getPort(); + if (port == canonicalURL.getDefaultPort()) + { + port = -1; + } + + String protocol = canonicalURL.getProtocol().toLowerCase(); + String pathAndQueryString = normalizePath(path) + queryString; + + URL result = new URL(protocol, host, port, pathAndQueryString); + return result.toExternalForm(); + + } + catch (MalformedURLException ex) + { + return null; + } + catch (URISyntaxException ex) + { + return null; + } + } + + /** + * Takes a query string, separates the constituent name-value pairs, and + * stores them in a SortedMap ordered by lexicographical order. + * + * @return Null if there is no query string. 
+ */ + private static SortedMap createParameterMap(final String queryString) + { + if (queryString == null || queryString.isEmpty()) + { + return null; + } + + final String[] pairs = queryString.split("&"); + final Map params = new HashMap<>(pairs.length); + + for (final String pair : pairs) + { + if (pair.length() == 0) + { + continue; + } + + String[] tokens = pair.split("=", 2); + switch (tokens.length) + { + case 1: + if (pair.charAt(0) == '=') + { + params.put("", tokens[0]); + } + else + { + params.put(tokens[0], ""); + } + break; + case 2: + params.put(tokens[0], tokens[1]); + break; + } + } + return new TreeMap<>(params); + } + + /** + * Canonicalize the query string. + * + * @param sortedParamMap + * Parameter name-value pairs in lexicographical order. + * @return Canonical form of query string. + */ + private static String canonicalize(final SortedMap sortedParamMap) + { + if (sortedParamMap == null || sortedParamMap.isEmpty()) + { + return ""; + } + + final StringBuffer sb = new StringBuffer(100); + for (Map.Entry pair : sortedParamMap.entrySet()) + { + final String key = pair.getKey().toLowerCase(); + if (key.equals("jsessionid") || key.equals("phpsessid") || key.equals("aspsessionid")) + { + continue; + } + if (sb.length() > 0) + { + sb.append('&'); + } + sb.append(percentEncodeRfc3986(pair.getKey())); + if (!pair.getValue().isEmpty()) + { + sb.append('='); + sb.append(percentEncodeRfc3986(pair.getValue())); + } + } + return sb.toString(); + } + /** + * Percent-encode values according the RFC 3986. The built-in Java + * URLEncoder does not encode according to the RFC, so we make the extra + * replacements. + * + * @param string + * Decoded string. + * @return Encoded string per RFC 3986. + */ + private static String percentEncodeRfc3986(String string) + { + try + { + string = string.replace("+", "%2B"); + string = URLDecoder.decode(string, "UTF-8"); + string = URLEncoder.encode(string, "UTF-8"); + return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~"); + } + catch (Exception e) + { + return string; + } + } + + private static String normalizePath(final String path) + { + return path.replace("%7E", "~").replace(" ", "%20"); + } +} diff --git a/src/WebCrawlThread.java b/src/WebCrawlThread.java new file mode 100644 index 0000000..b97f890 --- /dev/null +++ b/src/WebCrawlThread.java @@ -0,0 +1,366 @@ +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * Thread crawls from a start url + * @author zona + * + */ +public class WebCrawlThread extends Thread +{ + //absolute maximum number of urls to crawl + final int MAX_NUM_LINKS_TO_CRAWL = 1000; + final int MAX_DEPTH = 1000; + private ContentCrawlType crawlMode; + private String startUrl; + private int threadID; + private SynchronizedManager synchronizedManager; + + //cache with disallowed urls as per the robots.txt file. The key is the host url + //i.e. 
www.cnn.com, and the values are all urls disallowed for www.cnn.com
+    private static Map<String, List<String>> disallowListCache = new HashMap<>();
+
+    // Crawl lists
+    private static HashSet<String> crawledUrls = new HashSet<String>();
+    private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>();
+
+    public WebCrawlThread(ContentCrawlType aCrawlMode, String aStartUrl, int aThreadID)
+    {
+        this.crawlMode = aCrawlMode;
+        this.startUrl = aStartUrl;
+        this.threadID = aThreadID;
+        this.synchronizedManager = new SynchronizedManager();
+    }
+
+    public void init()
+    {
+        this.start();
+    }
+
+
+    // Check if robot is allowed to access the given URL.
+    // Different threads can access the cache map, so make it synchronized to prevent race conditions,
+    // but this makes it a bottleneck.
+    /**
+     * Checks the host's robots.txt to see whether the crawler may access the given URL.
+     * @param urlToCheck the URL to check
+     * @return true if crawling the URL is allowed
+     */
+    private synchronized boolean isRobotAllowed(URL urlToCheck)
+    {
+        String host = urlToCheck.getHost().toLowerCase();
+
+        // Retrieve host's disallow list from cache.
+        List<String> disallowList = disallowListCache.get(host);
+
+        // If list is not in the cache, download and cache it.
+        if (disallowList == null)
+        {
+            disallowList = new ArrayList<String>();
+            try
+            {
+                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
+
+                // Open connection to robot file URL for reading.
+                BufferedReader reader = new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
+
+                // Read robot file, creating list of disallowed paths.
+                String line;
+                while ((line = reader.readLine()) != null)
+                {
+                    // The line is compared in lower case, so the directive must be matched in lower case too.
+                    if (line.toLowerCase().indexOf("disallow:") == 0)
+                    {
+                        String disallowPath = line.toLowerCase().substring("disallow:".length());
+                        // Check disallow path for comments and remove if present.
+                        int commentIndex = disallowPath.indexOf("#");
+                        if (commentIndex != -1)
+                        {
+                            disallowPath = disallowPath.substring(0, commentIndex);
+                        }
+                        // Remove leading or trailing spaces from disallow path.
+                        disallowPath = disallowPath.trim();
+                        // Add disallow path to list.
+                        disallowList.add(disallowPath);
+                    }
+                }
+                // Add new disallow list to cache.
+                disallowListCache.put(host, disallowList);
+            }
+            catch (Exception e)
+            {
+                // Assume robot is allowed since an exception
+                // is thrown if the robot file doesn't exist.
+                return true;
+            }
+        }
+
+        // Loop through disallow list to see if
+        // crawling is allowed for the given URL.
+        String file = urlToCheck.getFile();
+        for (int i = 0; i < disallowList.size(); i++)
+        {
+            String disallow = disallowList.get(i);
+            if (file.startsWith(disallow))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Fetches html for given url
+     * @param pageUrl the URL of the page to download
+     * @return the page content as a string, or null if it could not be fetched
+     */
+    private String fetchPageContent(URL pageUrl)
+    {
+        try
+        {
+            // Open a connection to the URL and send a HEAD request
+            HttpURLConnection.setFollowRedirects(false);
+            HttpURLConnection con = (HttpURLConnection) pageUrl.openConnection();
+            con.setAllowUserInteraction(true);
+            con.setRequestMethod("HEAD");
+            con.setDoOutput(true);
+            con.setConnectTimeout(5000);
+            con.setReadTimeout(5000);
+            con.connect();
+
+            // Check if the page exists and if it is an HTML file
+            int code = con.getResponseCode();
+            String type = con.getContentType();
+
+            con.disconnect();
+
+            if (code != HttpURLConnection.HTTP_OK || type == null || !type.contains("text/html"))
+            {
+                return null;
+            }
+
+            // Open a connection to download the page content
+            InputStream pageStream = pageUrl.openStream();
+            BufferedReader reader = new BufferedReader(new InputStreamReader(pageStream));
+
+            // Read the page line by line and write into the buffer
+            String line;
+            StringBuffer pageBuffer = new StringBuffer();
+
+            while ((line = reader.readLine()) != null)
+            {
+                pageBuffer.append(line);
+            }
+            pageStream.close();
+            reader.close();
+
+            // Return page content as a string
+            return pageBuffer.toString();
+
+        }
+
+        catch (Exception e)
+        {
+            return null;
+        }
+    }
+
+
+    private List<String> extractLinks(URL pageUrl, String pageContent, HashSet<String> crawledUrls)
+    {
+        // Create the regular expression for matching URLs
+        // Starts with <a href=
+        Pattern pattern = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]", Pattern.CASE_INSENSITIVE);
+        Matcher matcher = pattern.matcher(pageContent);
+
+        // Create the list of extracted links
+        List<String> linkList = new ArrayList<String>();
+        int numLinksAdded = 0;
+        while (matcher.find())
+        {
+            // Get the string inside the anchor href attribute
+            String link = Utils.getCanonicalURL(matcher.group(1).trim());
+
+            // Skip empty links
+            if (link == null || link.isEmpty())
+            {
+                continue;
+            }
+
+            // Skip links that are just page anchors
+            if (link.charAt(0) == '#')
+            {
+                continue;
+            }
+
+            // Skip mailto links
+            if (link.toLowerCase().contains("mailto:"))
+            {
+                continue;
+            }
+
+            // Skip JavaScript links
+            if (link.toLowerCase().contains("javascript:"))
+            {
+                continue;
+            }
+
+
+            // Construct absolute from relative URLs if necessary
+            if (!link.contains("://"))
+            {
+                if (link.charAt(0) == '/')
+                {
+                    link = pageUrl.getProtocol() + "://" + pageUrl.getHost() + link;
+                }
+                else if (link.startsWith("../"))
+                {
+                    try
+                    {
+                        URL absolute = new URL(pageUrl, link);
+                        link = absolute.toString();
+                    }
+                    catch (MalformedURLException e)
+                    {
+                        link = "not valid";
+                    }
+                }
+                else
+                {
+                    String fileName = pageUrl.getFile();
+                    String linkBase = pageUrl.getProtocol() + "://" + pageUrl.getHost();
+
+                    if (!fileName.contains("/"))
+                    {
+                        link = linkBase + "/" + link;
+                    }
+                    else
+                    {
+                        String path = fileName.substring(0, fileName.lastIndexOf('/') + 1);
+                        link = linkBase + path + link;
+                    }
+                }
+            }
+
+            // If the link contains a named anchor, remove it
+            int index = link.indexOf('#');
+            if (index != -1)
+            {
+                link = link.substring(0, index);
+            }
+
+            // skip if it is the same as page url
+            if (Utils.removeTrailingSlash(link).toLowerCase().equals(pageUrl.toString().toLowerCase()))
+            {
+                continue;
+            }
+
+            // Verify the link and skip if invalid
+            URL checkedLink = Utils.checkUrl(link);
+            if (checkedLink == null)
+            {
+                continue;
+            }
+            //skip links outside domain if crawling in
focused domain mode + if (crawlMode == ContentCrawlType.FOCUSED_CRAWL_DOMAIN && !link.toLowerCase().contains(startUrl)) + { + //System.out.println("(Thread " + threadID + ") Not crawling " + link + " (out of domain " + startUrl + ")"); + continue; + } + // Skip the link if it has already been crawled + if (synchronizedManager.crawledUrlsSetContainsLink(link, crawledUrls)) + { + continue; + } + + // Add the link to the link list try to limit depth + if (numLinksAdded< MAX_DEPTH) + { + linkList.add(link); + numLinksAdded++; + } + else + { + break; + } + } + // Return the list of links found on the page + return linkList; + } + + + public void crawl() + { + System.out.println("\n Thread " + threadID + " is starting crawling...\n"); + + long startTime = System.currentTimeMillis(); + + // Add the start URL to the list of URLs to crawl + synchronizedManager.addToUrlsToCrawlSet(startUrl, urlsToCrawl); + + // Search until the number of found URLs reaches MAX_NUM_LINKS_TO_CRAWL or there are no more urls to crawl + while (synchronizedManager.keepCrawling(urlsToCrawl, crawledUrls, MAX_NUM_LINKS_TO_CRAWL) ) + { + // Get the URL + String url = synchronizedManager.getFromUrlsToCrawlSet(urlsToCrawl, crawledUrls); + + // Check and convert the URL string to the URL object + URL checkedUrl = Utils.checkUrl(url); + + // Skip URL if robots are not allowed to access it. + if (checkedUrl != null && isRobotAllowed(checkedUrl)) + { + // Download the page at the URL + String pageContent = fetchPageContent(checkedUrl); + if (pageContent != null && !pageContent.isEmpty()) + { + // Extract valid links from the page + List links = extractLinks(checkedUrl, pageContent, crawledUrls); + + // Add the links to the list of URLs to crawl + if(!links.isEmpty()) + { + synchronizedManager.addAllToUrlsToCrawlSet(links, urlsToCrawl); + } + +// // Add the page to the list of crawled URLs +// crawledUrls.add(url); + + // Display the crawled URL + System.out.println("(Thread " + threadID + ") " + url); + } + } + } + if(synchronizedManager.crawledUrlsSetSize(crawledUrls)>0) + { + long endTime = System.currentTimeMillis(); + DateFormat formatter = new SimpleDateFormat("mm:ss"); + String totalTime = formatter.format(endTime - startTime); + + System.out.println("\n (Thread " + threadID + ") Done. " + synchronizedManager.crawledUrlsSetSize(crawledUrls) + " URLs found. 
Total time: " + totalTime); + } + else + System.out.println("No valid URL could be found."); + } + + @Override + public void run() + { + crawl(); + } +} diff --git a/src/WebCrawlerMain.java b/src/WebCrawlerMain.java new file mode 100644 index 0000000..b6faebb --- /dev/null +++ b/src/WebCrawlerMain.java @@ -0,0 +1,114 @@ +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + + +public class WebCrawlerMain +{ + private static ContentCrawlType crawlMode; + private static final int NUM_SEARCH_RESULTS = 5;//how many google search results to use in topical crawling + + public static void main(String[] args) throws IOException + { + InputStreamReader converter = new InputStreamReader(System.in); + BufferedReader in = new BufferedReader(converter); + System.out.println("Enter the type of crawling: 1 (General Crawling), 2 (Focused domain Crawling) or 3 (Focused topic Crawling)."); + crawlMode = ContentCrawlType.GENERAL_CRAWL; + while (true) + { + String mode = in.readLine(); + if (!"1".equals(mode.trim()) && !"2".equals(mode.trim()) && !"3".equals(mode.trim())) + System.out.println("Invalid mode. Enter 1 (General), 2( Focused Domain) or 3 (Focused Topic)"); + else + { + if ("2".equals(mode.trim())) + crawlMode = ContentCrawlType.FOCUSED_CRAWL_DOMAIN; + else if ("3".equals(mode.trim())) + crawlMode = ContentCrawlType.FOCUSED_CRAWL_TOPIC; + break; + } + } + + List crawlThreads = new ArrayList<>(); + if (crawlMode == ContentCrawlType.GENERAL_CRAWL || crawlMode == ContentCrawlType.FOCUSED_CRAWL_DOMAIN) + { + String startUrl = null; + System.out.println("Enter the URL where to start crawling from:"); + int startUrlNum = 0; + while (true) + { + startUrl = in.readLine(); + if (!"q".equals(startUrl.trim()) && Utils.checkUrl(startUrl) != null) + { + startUrlNum++; + // create a thread for crawling + crawlThreads.add(new WebCrawlThread(crawlMode, startUrl, startUrlNum)); + System.out.println("Another URL? (q to quit)"); + } + else if ("q".equals(startUrl.trim())) + { + break; + } + else + System.out.println("The given URL is not valid. Please enter a valid URL or q to quit."); + } + + // Start crawling + for (WebCrawlThread crawlThread : crawlThreads) + { + crawlThread.init(); + } + } + else if (crawlMode == ContentCrawlType.FOCUSED_CRAWL_TOPIC)//focused topic crawler + { + String search; + System.out.println("Enter your topic"); + + search = in.readLine(); + //do google search on topic + String googleQuery = "http://www.google.com/search?q="; + String charset = "UTF-8"; + //Chrome + String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36"; + Elements links = Jsoup.connect(googleQuery + URLEncoder.encode(search, charset)).userAgent(userAgent).get().select("a[href]"); + System.out.println("Crawling through top " + NUM_SEARCH_RESULTS + " Google search results for topic " + search); + int intCountResults = 0; + //crawl the first NUM_SEARCH_RESULTS links + for (Element link : links) + { + String title = link.text(); + String startUrl = link.absUrl("href"); // Google returns URLs in format "http://www.google.com/url?q=&sa=U&ei=". 
+ + //url = URLDecoder.decode(url.substring(url.indexOf('=') + 1, url.indexOf('&')), "UTF-8"); + + if (!startUrl.startsWith("http") || startUrl.contains("google.com")) + { + continue; // Ads/news/etc. + } + + System.out.println("Title: " + title); + System.out.println("URL: " + startUrl); + + // Start crawling + if (Utils.checkUrl(startUrl) != null && intCountResults < NUM_SEARCH_RESULTS) + { + intCountResults++; + // Start a thread for crawling + WebCrawlThread crawlThread = new WebCrawlThread(crawlMode, startUrl,intCountResults ); + crawlThread.init(); + } + else + { + break; + } + } + } + } +}
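For reference, a minimal sketch (not part of the patch) of how the URL handling added above can be exercised on its own. UrlResolver.resolveUrl and Utils.getCanonicalURL are the methods defined in the diff; the throwaway UrlResolverDemo class, the example URLs and the expected outputs in the comments are illustrative assumptions based on the RFC 1808 resolution rules the code follows.

public class UrlResolverDemo
{
    public static void main(String[] args)
    {
        // Relative references are resolved against a base URL as described in RFC 1808, Section 4.
        // "../g" drops the "d.html" segment and backs out of the "c/" directory of the base path.
        System.out.println(UrlResolver.resolveUrl("http://example.com/a/b/c/d.html", "../g"));
        // expected: http://example.com/a/b/g

        // A reference starting with "/" keeps the scheme and host but replaces the whole path.
        System.out.println(UrlResolver.resolveUrl("http://example.com/a/b/c/d.html", "/x/y"));
        // expected: http://example.com/x/y

        // getCanonicalURL additionally lower-cases the host, drops the default port and the fragment,
        // collapses "//" and "." path segments, and sorts the query parameters.
        System.out.println(Utils.getCanonicalURL("HTTP://Example.COM:80/a//b/./c?b=2&a=1#frag"));
        // expected: http://example.com/a/b/c?a=1&b=2
    }
}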