Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
zonagit committed Sep 4, 2014
0 parents commit 979c1b9
Show file tree
Hide file tree
Showing 12 changed files with 1,329 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .classpath
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
<classpathentry kind="lib" path="external/lib/jsoup-1.7.3.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/bin/
17 changes: 17 additions & 0 deletions .project
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>WebCrawler</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
11 changes: 11 additions & 0 deletions .settings/org.eclipse.jdt.core.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
org.eclipse.jdt.core.compiler.compliance=1.7
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.7
Binary file added external/lib/jsoup-1.7.3.jar
Binary file not shown.
55 changes: 55 additions & 0 deletions readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
To compile from the command line and assuming that your path variable is pointing to the jdk bin folder, and your JAVA_HOME is
also set, run from the src folder

javac -cp ".;../external/lib/jsoup-1.7.3.jar" *.java

Then run from the same folder with

java WebCrawlerMain

Even better, import the project into Eclipse and run it from there.

The crawling algorithm is breadth first search when single threaded

If user enters more than one base url to crawl, a thread will be spawned for each. Threads can access any of the links
found by any other thread.

Adjustable params:

In WebCrawlerMain.java

private static final int NUM_SEARCH_RESULTS = 3;

controls how many google search results to use to seed the topical crawling
(an equal number of threads will be spawned)

In WebCrawlThread.java

final int MAX_NUM_LINKS_TO_CRAWL = 1000;

controls the maximum number of urls to crawl

and

final int MAX_DEPTH = 1000;

how many links from each page to retrieve

The data structures

private static HashSet<String> crawledUrls = new HashSet<String>();
private static LinkedHashSet<String> urlsToCrawl = new LinkedHashSet<String>();

in WebCrawlThread.java are shared by all threads, and access to them is done via synchronized methods to prevent race conditions
among the threads. In particular, the method getFromUrlsToCrawlSet will retrieve the next url from the LinkedHashSet urlsToCrawl
(this set keeps insertion order), remove it, and add it to the set crawledUrls.

Since this method executes atomically (only one thread can be inside it at a time), it is guaranteed that no two threads crawl the same url.








6 changes: 6 additions & 0 deletions src/ContentCrawlType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/**
 * The crawling strategy a crawler thread applies when deciding which
 * links to follow.
 */
public enum ContentCrawlType
{
    /** Unrestricted crawl: follow any discovered link. */
    GENERAL_CRAWL,
    /** Crawl restricted to a particular domain — presumably the seed url's domain; confirm in WebCrawlThread. */
    FOCUSED_CRAWL_DOMAIN,
    /** Topical crawl — per the readme, seeded from search results for a topic. */
    FOCUSED_CRAWL_TOPIC
}
43 changes: 43 additions & 0 deletions src/SynchronizedManager.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;

/**
 * Serializes access to the url work-sets shared by all crawler threads.
 *
 * <p>All methods are {@code synchronized} on the manager instance, so every
 * thread must go through the <em>same</em> {@code SynchronizedManager} object
 * for the mutual exclusion to hold. The two sets themselves (the pending
 * {@code urlsToCrawl} and the finished {@code crawledUrls}) are passed in by
 * the caller; per the readme they are static fields of WebCrawlThread shared
 * by every thread.
 *
 * <p>Not thread-safe against callers that touch the sets directly without
 * going through these methods.
 */
public class SynchronizedManager
{
    /**
     * Adds a single url to the pending set. A no-op if the url is already
     * pending ({@link LinkedHashSet} ignores duplicates).
     *
     * @param url         the url to queue for crawling
     * @param urlsToCrawl the shared pending set (insertion-ordered)
     */
    synchronized void addToUrlsToCrawlSet(String url, LinkedHashSet<String> urlsToCrawl)
    {
        urlsToCrawl.add(url);
    }

    /**
     * Adds every link in {@code links} to the pending set, skipping
     * duplicates already present.
     *
     * @param links       links harvested from a crawled page
     * @param urlsToCrawl the shared pending set (insertion-ordered)
     */
    synchronized void addAllToUrlsToCrawlSet(List<String> links, LinkedHashSet<String> urlsToCrawl)
    {
        urlsToCrawl.addAll(links);
    }

    /**
     * Atomically takes the oldest pending url (the {@link LinkedHashSet}
     * keeps insertion order, giving breadth-first behavior), removes it from
     * the pending set, and records it as crawled. Because claim-and-record
     * happens under one lock, no two threads can obtain the same url.
     *
     * @param urlsToCrawl the shared pending set; must be non-empty —
     *                    callers guard with {@link #keepCrawling}
     * @param crawledUrls the shared set of already-claimed urls
     * @return the url this thread should crawl next
     * @throws java.util.NoSuchElementException if the pending set is empty
     */
    synchronized String getFromUrlsToCrawlSet(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls)
    {
        // Single traversal: take the head and remove it via the iterator
        // (avoids a second hash lookup that remove(url) would cost).
        Iterator<String> it = urlsToCrawl.iterator();
        String url = it.next();
        it.remove();

        // Mark the url as claimed so no other thread crawls it again.
        crawledUrls.add(url);

        return url;
    }

    /**
     * Reports whether crawling should continue: there is still work pending
     * and the crawl budget has not been reached.
     *
     * <p>Uses a strict {@code <} comparison: once {@code maxLinksToCrawl}
     * urls have been claimed, crawling stops. (The previous {@code <=}
     * allowed one url beyond the documented maximum.)
     *
     * @param urlsToCrawl     the shared pending set
     * @param crawledUrls     the shared set of claimed urls
     * @param maxLinksToCrawl maximum number of urls to crawl in total
     * @return {@code true} if another url should be claimed and crawled
     */
    synchronized boolean keepCrawling(LinkedHashSet<String> urlsToCrawl, HashSet<String> crawledUrls, int maxLinksToCrawl)
    {
        return !urlsToCrawl.isEmpty() && crawledUrls.size() < maxLinksToCrawl;
    }

    /**
     * Reports whether {@code link} has already been claimed by some thread.
     *
     * @param link        the url to test
     * @param crawledUrls the shared set of claimed urls
     * @return {@code true} if the link was already crawled (or claimed)
     */
    synchronized boolean crawledUrlsSetContainsLink(String link, HashSet<String> crawledUrls)
    {
        return crawledUrls.contains(link);
    }

    /**
     * Returns the number of urls claimed so far.
     *
     * @param crawledUrls the shared set of claimed urls
     * @return the size of the crawled-url set
     */
    synchronized int crawledUrlsSetSize(HashSet<String> crawledUrls)
    {
        return crawledUrls.size();
    }
}
Loading

0 comments on commit 979c1b9

Please sign in to comment.