Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
zonagit committed Sep 4, 2014
0 parents commit 979c1b9
Show file tree
Hide file tree
Showing 12 changed files with 1,329 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .classpath
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
<classpathentry kind="lib" path="external/lib/jsoup-1.7.3.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/bin/
17 changes: 17 additions & 0 deletions .project
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>WebCrawler</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
11 changes: 11 additions & 0 deletions .settings/org.eclipse.jdt.core.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
org.eclipse.jdt.core.compiler.compliance=1.7
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.7
Binary file added external/lib/jsoup-1.7.3.jar
Binary file not shown.
55 changes: 55 additions & 0 deletions readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
To compile from the command line and assuming that your path variable is pointing to the jdk bin folder, and your JAVA_HOME is
also set, run from the src folder

javac -cp ".;../external/lib/jsoup-1.7.3.jar" *.java

Then run from the same folder with

java WebCrawlerMain

Even better, import the project into Eclipse and run it from there.

The crawling algorithm is breadth first search when single threaded

If user enters more than one base url to crawl, a thread will be spawned for each. Threads can access any of the links
found by any other thread.

Adjustable params:

In WebCrawlerMain.java

private static final int NUM_SEARCH_RESULTS = 3;

controls how many google search results to use to seed the topical crawling
(an equal number of threads will be spawned)

In WebCrawlThread.java

final int MAX_NUM_LINKS_TO_CRAWL = 1000;

controls the maximum number of urls to crawl

and

final int MAX_DEPTH = 1000;

how many links from each page to retrieve

The data structures

private static HashSet<String> crawledUrls = new HashSet<String>();
private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>();

in WebCrawlThread.java are shared by all threads, and access to them is done via synchronized methods to prevent race conditions
among the threads. In particular, the method getFromUrlsToCrawlSet will retrieve the next url from the LinkedHashSet urlsToCrawl
(this set keeps insertion order), remove it, and add it to the set crawledUrls.

Since this method executes atomically (only one thread can be inside it at a time), it is guaranteed that no two threads crawl the same url.








6 changes: 6 additions & 0 deletions src/ContentCrawlType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/**
 * The crawling strategy a crawler thread applies when deciding which
 * links to follow.
 */
public enum ContentCrawlType
{
    /** Unrestricted crawl: follow any discovered link. */
    GENERAL_CRAWL,
    /** Crawl restricted to a particular domain — presumably the seed url's domain; confirm in WebCrawlThread. */
    FOCUSED_CRAWL_DOMAIN,
    /** Topical crawl — per the readme, seeded from search results for a topic. */
    FOCUSED_CRAWL_TOPIC
}
43 changes: 43 additions & 0 deletions src/SynchronizedManager.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;

/**
 * Serializes access to the url work-sets shared by all crawler threads.
 *
 * <p>All methods are {@code synchronized} on the manager instance, so every
 * thread must go through the <em>same</em> {@code SynchronizedManager} object
 * for the mutual exclusion to hold. The two sets themselves (the pending
 * {@code urlsToCrawl} and the finished {@code crawledUrls}) are passed in by
 * the caller; per the readme they are static fields of WebCrawlThread shared
 * by every thread.
 *
 * <p>Not thread-safe against callers that touch the sets directly without
 * going through these methods.
 */
public class SynchronizedManager
{
    /**
     * Adds a single url to the pending set. A no-op if the url is already
     * pending ({@link LinkedHashSet} ignores duplicates).
     *
     * @param url         the url to queue for crawling
     * @param urlsToCrawl the shared pending set (insertion-ordered)
     */
    synchronized void addToUrlsToCrawlSet(String url, LinkedHashSet<String> urlsToCrawl)
    {
        urlsToCrawl.add(url);
    }

    /**
     * Adds every link in {@code links} to the pending set, skipping
     * duplicates already present.
     *
     * @param links       links harvested from a crawled page
     * @param urlsToCrawl the shared pending set (insertion-ordered)
     */
    synchronized void addAllToUrlsToCrawlSet(List<String> links, LinkedHashSet<String> urlsToCrawl)
    {
        urlsToCrawl.addAll(links);
    }

    /**
     * Atomically takes the oldest pending url (the {@link LinkedHashSet}
     * keeps insertion order, giving breadth-first behavior), removes it from
     * the pending set, and records it as crawled. Because claim-and-record
     * happens under one lock, no two threads can obtain the same url.
     *
     * @param urlsToCrawl the shared pending set; must be non-empty —
     *                    callers guard with {@link #keepCrawling}
     * @param crawledUrls the shared set of already-claimed urls
     * @return the url this thread should crawl next
     * @throws java.util.NoSuchElementException if the pending set is empty
     */
    synchronized String getFromUrlsToCrawlSet(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls)
    {
        // Single traversal: take the head and remove it via the iterator
        // (avoids a second hash lookup that remove(url) would cost).
        Iterator<String> it = urlsToCrawl.iterator();
        String url = it.next();
        it.remove();

        // Mark the url as claimed so no other thread crawls it again.
        crawledUrls.add(url);

        return url;
    }

    /**
     * Reports whether crawling should continue: there is still work pending
     * and the crawl budget has not been reached.
     *
     * <p>Uses a strict {@code <} comparison: once {@code maxLinksToCrawl}
     * urls have been claimed, crawling stops. (The previous {@code <=}
     * allowed one url beyond the documented maximum.)
     *
     * @param urlsToCrawl     the shared pending set
     * @param crawledUrls     the shared set of claimed urls
     * @param maxLinksToCrawl maximum number of urls to crawl in total
     * @return {@code true} if another url should be claimed and crawled
     */
    synchronized boolean keepCrawling(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls, int maxLinksToCrawl)
    {
        return !urlsToCrawl.isEmpty() && crawledUrls.size() < maxLinksToCrawl;
    }

    /**
     * Reports whether {@code link} has already been claimed by some thread.
     *
     * @param link        the url to test
     * @param crawledUrls the shared set of claimed urls
     * @return {@code true} if the link was already crawled (or claimed)
     */
    synchronized boolean crawledUrlsSetContainsLink(String link, HashSet<String> crawledUrls)
    {
        return crawledUrls.contains(link);
    }

    /**
     * Returns the number of urls claimed so far.
     *
     * @param crawledUrls the shared set of claimed urls
     * @return the size of the crawled-url set
     */
    synchronized int crawledUrlsSetSize(HashSet<String> crawledUrls)
    {
        return crawledUrls.size();
    }
}
Loading

0 comments on commit 979c1b9

Please sign in to comment.