Skip to content

Commit d685742

Browse files
committed
Added NLTK NER
1 parent 6a09233 commit d685742

File tree

4 files changed

+210
-1
lines changed

4 files changed

+210
-1
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ target
99
*.iws
1010
*.bin
1111
nbactions.xml
12-
nb-configuration.xml
12+
nb-configuration.xml
13+
*.DS_Store

tika-parsers/pom.xml

+7
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@
8888
<version>2.1.1</version>
8989
</dependency>
9090

91+
92+
<!-- Apache HttpClient: used by the NLTK NER recogniser to call the NLTK REST server -->
93+
<dependency>
94+
<groupId>org.apache.httpcomponents</groupId>
95+
<artifactId>httpclient</artifactId>
96+
<version>4.5.1</version>
97+
</dependency>
9198
<!-- Optional OSGi dependencies, used only when running within OSGi -->
9299
<dependency>
93100
<groupId>org.apache.felix</groupId>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.parser.ner.nltk;
18+
19+
import org.apache.http.client.methods.HttpGet;
20+
import org.apache.tika.parser.ner.NERecogniser;
21+
import org.json.simple.JSONArray;
22+
import org.json.simple.JSONObject;
23+
import org.json.simple.parser.JSONParser;
24+
import org.slf4j.Logger;
25+
import org.slf4j.LoggerFactory;
26+
27+
import java.util.HashMap;
28+
import java.util.HashSet;
29+
import java.util.List;
30+
import java.util.Map;
31+
import java.util.Set;
32+
33+
import java.io.BufferedReader;
34+
import java.io.InputStreamReader;
35+
import java.util.ArrayList;
36+
import org.apache.http.HttpResponse;
37+
import org.apache.http.NameValuePair;
38+
import org.apache.http.client.HttpClient;
39+
import org.apache.http.client.entity.UrlEncodedFormEntity;
40+
import org.apache.http.client.methods.HttpPost;
41+
import org.apache.http.impl.client.HttpClientBuilder;
42+
import org.apache.http.message.BasicNameValuePair;
43+
44+
45+
/**
46+
* This class offers an implementation of {@link NERecogniser} based on
47+
* CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
48+
* due to runtime binding to Stanford CoreNLP.
49+
* See <a href="http://wiki.apache.org/tika/TikaAndNER#NLTK">
50+
* Tika NER Wiki</a> for configuring this recogniser.
51+
* @see NERecogniser
52+
*
53+
*/
54+
public class NLTKNERecogniser implements NERecogniser {
55+
56+
private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
57+
private final static String USER_AGENT = "Mozilla/5.0";
58+
private static boolean available = false;
59+
public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
60+
add(PERSON);
61+
add(TIME);
62+
add(LOCATION);
63+
add(ORGANIZATION);
64+
add(MONEY);
65+
add(PERCENT);
66+
add(DATE);
67+
add(FACILITY);
68+
add(GPE);
69+
}};
70+
71+
public NLTKNERecogniser(){
72+
try {
73+
74+
String url = "http://localhost:5000/";
75+
HttpClient client = HttpClientBuilder.create().build();
76+
HttpGet get = new HttpGet(url);
77+
78+
// add header
79+
get.setHeader("User-Agent", USER_AGENT);
80+
HttpResponse response = client.execute(get);
81+
int responseCode = response.getStatusLine().getStatusCode();
82+
if(responseCode == 200){
83+
available = true;
84+
}
85+
else{
86+
LOG.info("NLTKRest Server is not running");
87+
}
88+
89+
} catch (Exception e) {
90+
LOG.debug(e.getMessage(), e);
91+
}
92+
}
93+
94+
95+
/**
96+
*
97+
* @return {@code true} if model was available, valid and was able to initialise the classifier.
98+
* returns {@code false} when this recogniser is not available for service.
99+
*/
100+
public boolean isAvailable() {
101+
return available;
102+
}
103+
104+
/**
105+
* Gets set of entity types recognised by this recogniser
106+
* @return set of entity classes/types
107+
*/
108+
public Set<String> getEntityTypes() {
109+
return ENTITY_TYPES;
110+
}
111+
112+
/**
113+
* recognises names of entities in the text
114+
* @param text text which possibly contains names
115+
* @return map of entity type -> set of names
116+
*/
117+
public Map<String, Set<String>> recognise(String text) {
118+
Map<String, Set<String>> entities = new HashMap<>();
119+
try {
120+
String url = "http://localhost:5000/nltk";
121+
HttpClient client = HttpClientBuilder.create().build();
122+
HttpPost post = new HttpPost(url);
123+
// add header
124+
post.setHeader("User-Agent", USER_AGENT);
125+
List<NameValuePair> urlParameters = new ArrayList<NameValuePair>();
126+
urlParameters.add(new BasicNameValuePair("text", text));
127+
post.setEntity(new UrlEncodedFormEntity(urlParameters));
128+
129+
HttpResponse response = client.execute(post);
130+
131+
int responseCode = response.getStatusLine().getStatusCode();
132+
if (responseCode == 200) {
133+
BufferedReader rd = new BufferedReader(
134+
new InputStreamReader(response.getEntity().getContent()));
135+
136+
String result = rd.readLine();
137+
138+
JSONParser parser = new JSONParser();
139+
JSONObject j = (JSONObject) parser.parse(result);
140+
JSONArray aa = new JSONArray();
141+
for (Object x : j.keySet()) {
142+
aa = (JSONArray) j.get(x.toString());
143+
Set s = new HashSet();
144+
for (Object y : aa) {
145+
s.add(y.toString());
146+
}
147+
entities.put(x.toString(), s);
148+
}
149+
}
150+
}
151+
catch (Exception e) {
152+
LOG.debug(e.getMessage(), e);
153+
}
154+
ENTITY_TYPES.clear();
155+
ENTITY_TYPES.addAll(entities.keySet());
156+
LOG.info("returning this:" + entities.keySet().toString());
157+
return entities;
158+
}
159+
160+
161+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package org.apache.tika.parser.ner.nltk;
2+
3+
/**
4+
* Created by manali on 2/1/16.
5+
*/
6+
import org.apache.commons.logging.Log;
7+
import org.apache.tika.Tika;
8+
import org.apache.tika.config.TikaConfig;
9+
import org.apache.tika.metadata.Metadata;
10+
import org.apache.tika.parser.ner.NamedEntityParser;
11+
import org.junit.Test;
12+
13+
import java.io.ByteArrayInputStream;
14+
import java.nio.charset.StandardCharsets;
15+
import java.util.Arrays;
16+
import java.util.HashSet;
17+
import java.util.Set;
18+
19+
import static org.junit.Assert.assertTrue;
20+
21+
public class NLTKNERecogniserTest {
22+
@Test
23+
public void testGetEntityTypes() throws Exception {
24+
25+
String text = "America";
26+
System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
27+
28+
Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
29+
Metadata md = new Metadata();
30+
tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
31+
32+
33+
Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
34+
if(gpe.size() == 0) return;
35+
else {
36+
assertTrue(gpe.contains("America"));
37+
assertTrue(gpe.size() == 1); //and nothing else
38+
}
39+
}
40+
}

0 commit comments

Comments
 (0)