diff --git a/.gitignore b/.gitignore index 615c5c5c9..a7718a6a8 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ target/ /goose.iml /goose.ipr /goose.iws +.idea \ No newline at end of file diff --git a/src/main/scala/com/gravity/goose/Article.scala b/src/main/scala/com/gravity/goose/Article.scala index 40b4dadcd..6e0bddaa2 100644 --- a/src/main/scala/com/gravity/goose/Article.scala +++ b/src/main/scala/com/gravity/goose/Article.scala @@ -22,6 +22,7 @@ import images.Image import org.jsoup.nodes.{Element, Document} import java.util.Date import scala.collection._ +import com.gravity.goose.opengraph.OpenGraphData /** * Created by Jim Plush @@ -122,4 +123,9 @@ class Article { * @return a {@link Map Map<String,String>} of property name to property vaue (represented as a {@link String}. */ var additionalData: Map[String, String] = Map.empty + + /** + * Facebook Open Graph data that is found in Article Meta tags + */ + var openGraphData: OpenGraphData = null } \ No newline at end of file diff --git a/src/main/scala/com/gravity/goose/Configuration.scala b/src/main/scala/com/gravity/goose/Configuration.scala index 20ce4653a..ca64a1b4d 100644 --- a/src/main/scala/com/gravity/goose/Configuration.scala +++ b/src/main/scala/com/gravity/goose/Configuration.scala @@ -22,7 +22,7 @@ import network.{HtmlFetcher, AbstractHtmlFetcher} import org.jsoup.nodes.Element import java.util.Date import reflect.BeanProperty -import com.gravity.goose.extractors.{StandardContentExtractor, ContentExtractor, AdditionalDataExtractor, PublishDateExtractor} +import com.gravity.goose.extractors._ /** @@ -115,6 +115,12 @@ class Configuration { this.additionalDataExtractor = extractor } + var openGraphDataExtractor: OpenGraphDataExtractor = new OpenGraphDataExtractor + + def getOpenGraphDataExtractor: OpenGraphDataExtractor = { + openGraphDataExtractor + } + var htmlFetcher: AbstractHtmlFetcher = HtmlFetcher def setHtmlFetcher(fetcher: AbstractHtmlFetcher) { diff --git 
a/src/main/scala/com/gravity/goose/Crawler.scala b/src/main/scala/com/gravity/goose/Crawler.scala index 4f3b32344..4ac3a9c16 100644 --- a/src/main/scala/com/gravity/goose/Crawler.scala +++ b/src/main/scala/com/gravity/goose/Crawler.scala @@ -67,6 +67,7 @@ class Crawler(config: Configuration) { article.metaKeywords = extractor.getMetaKeywords(article) article.canonicalLink = extractor.getCanonicalLink(article) article.tags = extractor.extractTags(article) + article.openGraphData = config.getOpenGraphDataExtractor.extract(doc) // before we do any calcs on the body itself let's clean up the document article.doc = docCleaner.clean(article)
diff --git a/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala b/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala
new file mode 100644
index 000000000..77b8dc7e1
--- /dev/null
+++ b/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala
@@ -0,0 +1,61 @@
+/**
+Copyright [2014] Robby Pond
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package com.gravity.goose.extractors
+
+import org.jsoup.nodes.Element
+
+import scala.collection.JavaConverters._
+import com.gravity.goose.opengraph.OpenGraphData
+
+/**
+ * Builds an OpenGraphData from the `meta` tags selected from an element.
+ *
+ * Only the `og:*` and `article:*` property names matched in `extract` are recorded;
+ * every other `meta` tag is ignored.
+ */
+class OpenGraphDataExtractor extends Extractor[OpenGraphData] {
+
+  /**
+   * Copies the `content` attribute of each recognised `meta` tag into a new OpenGraphData.
+   *
+   * If a property appears on several tags, the value of the last one in selection order is kept.
+   * Fields with no matching tag keep the defaults assigned by OpenGraphData.
+   *
+   * @param rootElement element whose `meta` tags are selected and inspected
+   * @return a freshly created OpenGraphData, populated from the matching tags
+   */
+  def extract(rootElement: Element): OpenGraphData = {
+    val openGraphData = new OpenGraphData
+    // Explicit asScala instead of the implicit JavaConversions view over jsoup's Java list.
+    for (meta <- rootElement.select("meta").asScala) {
+      // Read each attribute once per tag rather than once per comparison.
+      val content = meta.attr("content")
+      meta.attr("property") match {
+        case "og:title" => openGraphData.title = content
+        case "og:site_name" => openGraphData.siteName = content
+        case "og:url" => openGraphData.url = content
+        case "og:description" => openGraphData.description = content
+        case "og:image" => openGraphData.image = content
+        case "og:type" => openGraphData.ogType = content
+        case "og:locale" => openGraphData.locale = content
+        case "article:author" => openGraphData.author = content
+        case "article:publisher" => openGraphData.publisher = content
+        case _ => // not a property this extractor records
+      }
+    }
+    openGraphData
+  }
+}
\ No newline at end of file
diff --git a/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala b/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala
new file mode 100644
index 000000000..0e9d8cfe6
--- /dev/null
+++ b/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala
@@ -0,0 +1,29 @@
+/**
+Copyright [2014] Robby Pond
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package com.gravity.goose.opengraph
+
+class OpenGraphData {
+  // Each field holds the raw `content` of the meta property named beside it; the default stays when no tag matches.
+  var title: String = "" // og:title
+  var siteName: String = "" // og:site_name
+  var url: String = "" // og:url
+  var description: String = "" // og:description
+  var image: String = "" // og:image
+  var ogType: String = "" // og:type
+  var locale: String = "en_US" // og:locale
+  var author: String = "" // article:author
+  var publisher: String = "" // article:publisher
+}
diff --git a/src/test/scala/com/gravity/goose/OpenGraphTest.scala b/src/test/scala/com/gravity/goose/OpenGraphTest.scala
new file mode 100644
index 000000000..9fe20250e
--- /dev/null
+++ b/src/test/scala/com/gravity/goose/OpenGraphTest.scala
@@ -0,0 +1,44 @@
+package com.gravity.goose
+
+import org.junit.Test
+import org.junit.Assert._
+
+/**
+ * Checks that Open Graph meta tags end up on Article.openGraphData.
+ */
+class OpenGraphTest {
+
+  /**
+   * Obtains the article for the Telegraph URL through TestUtils.getArticle and compares five
+   * Open Graph fields. JUnit's assertEquals takes (message, expected, actual), so the literal goes first.
+   */
+  @Test
+  def openGraph() {
+    implicit val config = TestUtils.NO_IMAGE_CONFIG
+    // og tags for http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html
+    /* (markup restated from the values asserted below)
+    <meta property="og:description" content="A 'world's hottest chilli' competition at a curry restaurant left two people in hospital."/>
+    <meta property="og:title" content="World's hottest chilli contest leaves two in hospital - Telegraph"/>
+    <meta property="og:url" content="http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html"/>
+    <meta property="og:image" content="http://i.telegraph.co.uk/multimedia/archive/02018/Kismot-Killer_2018476a.jpg"/>
+    <meta property="og:type" content="article"/>
+    */
+    val url: String = "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html"
+    val article = TestUtils.getArticle(url)
+    assertEquals("og:description was not as expected!",
+      "A 'world's hottest chilli' competition at a curry restaurant left two people in hospital.",
+      article.openGraphData.description)
+    assertEquals("og:title was not as expected!",
+      "World's hottest chilli contest leaves two in hospital - Telegraph",
+      article.openGraphData.title)
+    assertEquals("og:url was not as expected!",
+      "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html",
+      article.openGraphData.url)
+    assertEquals("og:image was not as expected!",
+      "http://i.telegraph.co.uk/multimedia/archive/02018/Kismot-Killer_2018476a.jpg",
+      article.openGraphData.image)
+    assertEquals("og:type was not as expected!",
+      "article",
+      article.openGraphData.ogType)
+  }
+}
\ No newline at end of file