-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 979c1b9
Showing
12 changed files
with
1,329 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<classpath> | ||
<classpathentry kind="src" path="src"/> | ||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/> | ||
<classpathentry kind="lib" path="C:/EclipseLuna/BigData/WebCrawler/external/lib/jsoup-1.7.3.jar"/> | ||
<classpathentry kind="output" path="bin"/> | ||
</classpath> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/bin/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<projectDescription> | ||
<name>WebCrawler</name> | ||
<comment></comment> | ||
<projects> | ||
</projects> | ||
<buildSpec> | ||
<buildCommand> | ||
<name>org.eclipse.jdt.core.javabuilder</name> | ||
<arguments> | ||
</arguments> | ||
</buildCommand> | ||
</buildSpec> | ||
<natures> | ||
<nature>org.eclipse.jdt.core.javanature</nature> | ||
</natures> | ||
</projectDescription> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
eclipse.preferences.version=1 | ||
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled | ||
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 | ||
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve | ||
org.eclipse.jdt.core.compiler.compliance=1.7 | ||
org.eclipse.jdt.core.compiler.debug.lineNumber=generate | ||
org.eclipse.jdt.core.compiler.debug.localVariable=generate | ||
org.eclipse.jdt.core.compiler.debug.sourceFile=generate | ||
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error | ||
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error | ||
org.eclipse.jdt.core.compiler.source=1.7 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
To compile from the command line and assuming that your path variable is pointing to the jdk bin folder, and your JAVA_HOME is | ||
also set, run from the src folder | ||
|
||
javac -cp ".;../external/lib/jsoup-1.7.3.jar" *.java | ||
|
||
Then run from the same folder with | ||
|
||
java -cp ".;../external/lib/jsoup-1.7.3.jar" WebCrawlerMain | ||
|
||
Even better, import the project into Eclipse and run it from there. | ||
|
||
The crawling algorithm is breadth first search when single threaded | ||
|
||
If user enters more than one base url to crawl, a thread will be spawned for each. Threads can access any of the links | ||
found by any other thread. | ||
|
||
Adjustable params: | ||
|
||
In WebCrawlerMain.java | ||
|
||
private static final int NUM_SEARCH_RESULTS = 3; | ||
|
||
controls how many google search results to use to seed the topical crawling | ||
(an equal number of threads will be spawned) | ||
|
||
In WebCrawlThread.java | ||
|
||
final int MAX_NUM_LINKS_TO_CRAWL = 1000; | ||
|
||
controls the maximum number of urls to crawl | ||
|
||
and | ||
|
||
final int MAX_DEPTH = 1000; | ||
|
||
controls how many links to retrieve from each page | ||
|
||
The data structures | ||
|
||
private static HashSet<String> crawledUrls = new HashSet<String>(); | ||
private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>(); | ||
|
||
in WebCrawlThread.java are shared by all threads, and access to them goes through synchronized methods to prevent race conditions | ||
among the threads. In particular, the method getFromUrlsToCrawlSet retrieves the next url from the LinkedHashSet urlsToCrawl | ||
(this set keeps insertion order), removes it, and adds it to the set crawledUrls. | ||
|
||
Since the whole method is executed by only one thread at a time, no two threads can crawl the same url. | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
/**
 * Names the crawl modes known to the crawler.
 *
 * <p>This type only declares the constants; how each mode is interpreted is decided by the
 * code that consumes it. The declaration order is kept stable so that {@code values()} and
 * {@code ordinal()} stay the same for existing callers.
 */
public enum ContentCrawlType
{
    /** Crawl without a declared focus. */
    GENERAL_CRAWL,

    /** Crawl focused on a domain. */
    FOCUSED_CRAWL_DOMAIN,

    /** Crawl focused on a topic. */
    FOCUSED_CRAWL_TOPIC
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import java.util.HashSet; | ||
import java.util.LinkedHashSet; | ||
import java.util.List; | ||
|
||
/**
 * Funnels every read and write of the crawl bookkeeping sets through synchronized methods.
 *
 * <p>The sets themselves are owned by the caller and passed in on each call:
 * {@code urlsToCrawl} holds the pending URLs in insertion order and {@code crawledUrls}
 * holds the URLs that have already been handed out.
 *
 * <p>All methods are {@code synchronized} on this instance, so mutual exclusion only holds
 * between threads that call through the <em>same</em> {@code SynchronizedManager} object.
 */
public class SynchronizedManager
{
    /**
     * Appends a single URL to the pending set; a URL that is already pending is ignored.
     *
     * @param url         the URL to enqueue
     * @param urlsToCrawl the pending set to add to
     */
    synchronized void addToUrlsToCrawlSet(String url, LinkedHashSet<String> urlsToCrawl)
    {
        urlsToCrawl.add(url);
    }

    /**
     * Appends every link to the pending set, skipping the ones that are already pending.
     *
     * @param links       the URLs to enqueue, in the order they should be crawled
     * @param urlsToCrawl the pending set to add to
     */
    synchronized void addAllToUrlsToCrawlSet(List<String> links, LinkedHashSet<String> urlsToCrawl)
    {
        urlsToCrawl.addAll(links);
    }

    /**
     * Takes the oldest pending URL and records it as crawled in one atomic step.
     *
     * @param urlsToCrawl the pending set; its first element is removed
     * @param crawledUrls the set that receives the returned URL
     * @return the URL that was first in {@code urlsToCrawl}
     * @throws java.util.NoSuchElementException if {@code urlsToCrawl} is empty, which can
     *         happen when another thread drained it after {@link #keepCrawling} was checked
     */
    synchronized String getFromUrlsToCrawlSet(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls)
    {
        String url = urlsToCrawl.iterator().next();

        // Remove the page from the pending set so it is handed out only once.
        urlsToCrawl.remove(url);

        // Record the page as crawled so that a different thread does not crawl it again.
        crawledUrls.add(url);

        return url;
    }

    /**
     * Tells whether there is still work to do and the crawl limit has not been reached.
     *
     * <p>The comparison is strict: {@link #getFromUrlsToCrawlSet} grows {@code crawledUrls}
     * by one per URL it hands out, so allowing {@code size() == maxLinksToCrawl} would hand
     * out one URL more than the limit.
     *
     * @param urlsToCrawl     the pending set
     * @param crawledUrls     the set of URLs already handed out
     * @param maxLinksToCrawl the maximum number of URLs to crawl
     * @return {@code true} if a URL is pending and fewer than {@code maxLinksToCrawl} URLs
     *         have been handed out so far
     */
    synchronized boolean keepCrawling(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls, int maxLinksToCrawl)
    {
        return !urlsToCrawl.isEmpty() && crawledUrls.size() < maxLinksToCrawl;
    }

    /**
     * Tells whether the given link has already been handed out for crawling.
     *
     * @param link        the URL to look up
     * @param crawledUrls the set of URLs already handed out
     * @return {@code true} if {@code crawledUrls} contains {@code link}
     */
    synchronized boolean crawledUrlsSetContainsLink(String link, HashSet<String> crawledUrls)
    {
        return crawledUrls.contains(link);
    }

    /**
     * Returns how many URLs have been handed out for crawling so far.
     *
     * @param crawledUrls the set of URLs already handed out
     * @return the number of elements in {@code crawledUrls}
     */
    synchronized int crawledUrlsSetSize(HashSet<String> crawledUrls)
    {
        return crawledUrls.size();
    }
}
Oops, something went wrong.