diff --git a/.gitignore b/.gitignore
index 615c5c5c9..a7718a6a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ target/
/goose.iml
/goose.ipr
/goose.iws
+.idea
\ No newline at end of file
diff --git a/src/main/scala/com/gravity/goose/Article.scala b/src/main/scala/com/gravity/goose/Article.scala
index 40b4dadcd..6e0bddaa2 100644
--- a/src/main/scala/com/gravity/goose/Article.scala
+++ b/src/main/scala/com/gravity/goose/Article.scala
@@ -22,6 +22,7 @@ import images.Image
import org.jsoup.nodes.{Element, Document}
import java.util.Date
import scala.collection._
+import com.gravity.goose.opengraph.OpenGraphData
/**
* Created by Jim Plush
@@ -122,4 +123,9 @@ class Article {
* @return a {@link Map Map<String,String>} of property name to property vaue (represented as a {@link String}.
*/
var additionalData: Map[String, String] = Map.empty
+
+ /**
+ * Facebook Open Graph data that is found in Article Meta tags
+ */
+ var openGraphData: OpenGraphData = null
}
\ No newline at end of file
diff --git a/src/main/scala/com/gravity/goose/Configuration.scala b/src/main/scala/com/gravity/goose/Configuration.scala
index 20ce4653a..ca64a1b4d 100644
--- a/src/main/scala/com/gravity/goose/Configuration.scala
+++ b/src/main/scala/com/gravity/goose/Configuration.scala
@@ -22,7 +22,7 @@ import network.{HtmlFetcher, AbstractHtmlFetcher}
import org.jsoup.nodes.Element
import java.util.Date
import reflect.BeanProperty
-import com.gravity.goose.extractors.{StandardContentExtractor, ContentExtractor, AdditionalDataExtractor, PublishDateExtractor}
+import com.gravity.goose.extractors._
/**
@@ -115,6 +115,12 @@ class Configuration {
this.additionalDataExtractor = extractor
}
+ var openGraphDataExtractor: OpenGraphDataExtractor = new OpenGraphDataExtractor
+
+ /** Java-style accessor: returns the current value of the `openGraphDataExtractor` field. */
+ def getOpenGraphDataExtractor: OpenGraphDataExtractor =
+   openGraphDataExtractor
+
var htmlFetcher: AbstractHtmlFetcher = HtmlFetcher
def setHtmlFetcher(fetcher: AbstractHtmlFetcher) {
diff --git a/src/main/scala/com/gravity/goose/Crawler.scala b/src/main/scala/com/gravity/goose/Crawler.scala
index 4f3b32344..4ac3a9c16 100644
--- a/src/main/scala/com/gravity/goose/Crawler.scala
+++ b/src/main/scala/com/gravity/goose/Crawler.scala
@@ -67,6 +67,7 @@ class Crawler(config: Configuration) {
article.metaKeywords = extractor.getMetaKeywords(article)
article.canonicalLink = extractor.getCanonicalLink(article)
article.tags = extractor.extractTags(article)
+ article.openGraphData = config.getOpenGraphDataExtractor.extract(doc)
// before we do any calcs on the body itself let's clean up the document
article.doc = docCleaner.clean(article)
diff --git a/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala b/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala
new file mode 100644
index 000000000..77b8dc7e1
--- /dev/null
+++ b/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala
@@ -0,0 +1,50 @@
+/**
+Copyright [2014] Robby Pond
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package com.gravity.goose.extractors
+
+import org.jsoup.nodes.Element
+
+import scala.collection.JavaConversions._
+import com.gravity.goose.opengraph.OpenGraphData
+
+/** Reads Open Graph (`og:*`) and `article:*` meta tags into an `OpenGraphData`. */
+class OpenGraphDataExtractor extends Extractor[OpenGraphData] {
+
+  /**
+   * Scans every `meta` element under `rootElement` and copies the `content`
+   * attribute of each recognised `property` into the matching field of the result.
+   * When a property occurs more than once, the last occurrence wins.
+   */
+  def extract(rootElement: Element): OpenGraphData = {
+    val result = new OpenGraphData
+    val metaTags: scala.collection.mutable.Buffer[Element] = rootElement.select("meta")
+    metaTags.foreach { tag =>
+      tag.attr("property") match {
+        case "og:title"          => result.title = tag.attr("content")
+        case "og:site_name"      => result.siteName = tag.attr("content")
+        case "og:url"            => result.url = tag.attr("content")
+        case "og:description"    => result.description = tag.attr("content")
+        case "og:image"          => result.image = tag.attr("content")
+        case "og:type"           => result.ogType = tag.attr("content")
+        case "og:locale"         => result.locale = tag.attr("content")
+        case "article:author"    => result.author = tag.attr("content")
+        case "article:publisher" => result.publisher = tag.attr("content")
+        case _                   => // not a property this extractor records
+      }
+    }
+    result
+  }
+}
\ No newline at end of file
diff --git a/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala b/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala
new file mode 100644
index 000000000..0e9d8cfe6
--- /dev/null
+++ b/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala
@@ -0,0 +1,29 @@
+/**
+Copyright [2014] Robby Pond
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package com.gravity.goose.opengraph;
+
+/** Mutable holder for the Open Graph values read from a page's meta tags. */
+class OpenGraphData {
+
+  // og:title, og:site_name, og:url, og:description, og:image and og:type; empty until set
+  var title, siteName, url, description, image, ogType: String = ""
+
+  // og:locale; starts out as "en_US"
+  var locale: String = "en_US"
+
+  // article:author and article:publisher; empty until set
+  var author, publisher: String = ""
+}
diff --git a/src/test/scala/com/gravity/goose/OpenGraphTest.scala b/src/test/scala/com/gravity/goose/OpenGraphTest.scala
new file mode 100644
index 000000000..9fe20250e
--- /dev/null
+++ b/src/test/scala/com/gravity/goose/OpenGraphTest.scala
@@ -0,0 +1,32 @@
+package com.gravity.goose
+
+import org.junit.Test
+import org.junit.Assert._
+
+/** Checks that the Open Graph meta tags of a crawled page end up in `Article.openGraphData`. */
+class OpenGraphTest {
+  @Test
+  def openGraph(): Unit = {
+    implicit val config = TestUtils.NO_IMAGE_CONFIG // presumably picked up implicitly by getArticle; confirm
+    val url: String = "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html"
+    // The assertions below expect the page at `url` to carry these og tags:
+    //   <meta property="og:description" content="A 'world's hottest chilli' competition at a curry restaurant left two people in hospital."/>
+    //   <meta property="og:title" content="World's hottest chilli contest leaves two in hospital - Telegraph"/>
+    //   <meta property="og:url" content="http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html"/>
+    //   <meta property="og:image" content="http://i.telegraph.co.uk/multimedia/archive/02018/Kismot-Killer_2018476a.jpg"/>
+    //   <meta property="og:type" content="article"/>
+    val article = TestUtils.getArticle(url)
+    val og = article.openGraphData
+    // JUnit's signature is assertEquals(message, expected, actual): the expected literal goes first.
+    assertEquals("og:description was not as expected!",
+      "A 'world's hottest chilli' competition at a curry restaurant left two people in hospital.", og.description)
+    assertEquals("og:title was not as expected!",
+      "World's hottest chilli contest leaves two in hospital - Telegraph", og.title)
+    assertEquals("og:url was not as expected!",
+      "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html", og.url)
+    assertEquals("og:image was not as expected!",
+      "http://i.telegraph.co.uk/multimedia/archive/02018/Kismot-Killer_2018476a.jpg", og.image)
+    assertEquals("og:type was not as expected!",
+      "article", og.ogType)
+  }
+}
\ No newline at end of file