Skip to content

Commit 86145d9

Browse files
committed
fix for TIKA-1943 contributed by Mark Duske
Includes support for Yandex Translate API
1 parent f509917 commit 86145d9

File tree

1 file changed

+175
-0
lines changed

1 file changed

+175
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.tika.language.translate;
19+
20+
import javax.ws.rs.core.MediaType;
21+
import javax.ws.rs.core.Response;
22+
23+
import java.io.BufferedReader;
24+
import java.io.IOException;
25+
import java.io.InputStream;
26+
import java.io.InputStreamReader;
27+
import java.util.Properties;
28+
29+
import com.fasterxml.jackson.core.JsonParseException;
30+
import com.fasterxml.jackson.databind.JsonNode;
31+
import com.fasterxml.jackson.databind.ObjectMapper;
32+
import org.apache.cxf.jaxrs.client.WebClient;
33+
import org.apache.tika.exception.TikaException;
34+
import org.apache.tika.language.translate.Translator;
35+
36+
import static java.nio.charset.StandardCharsets.UTF_8;
37+
38+
/**
39+
* An implementation of a REST client for the YANDEX <a href="https://tech.yandex.com/translate/">Translate API</a>.
40+
* You can sign up for free access online on the <a href="https://tech.yandex.com/key/form.xml?service=trnsl">API Key form</a>
41+
* and set your Application's User Key in the <code>translator.yandex.properties</code> file.
42+
*/
43+
public class YandexTranslator implements Translator {
44+
45+
/**
46+
* Yandex Translate API service end-point URL
47+
*/
48+
private static final String YANDEX_TRANSLATE_URL_BASE = "https://translate.yandex.net/api/v1.5/tr.json/translate";
49+
50+
/**
51+
* Default USer-Key, a real User-Key must be provided before the Lingo24 can successfully request translations
52+
*/
53+
private static final String DEFAULT_KEY = "dummy-key";
54+
55+
/**
56+
* Identifies the client of the request, used for authentication
57+
*/
58+
private String apiKey;
59+
60+
/**
61+
* The Yandex Translate API can handle text in <b>plain</b> and/or <b>html</b> format, the default
62+
* format is <b>plain</b>
63+
*/
64+
private String format = "plain";
65+
66+
public YandexTranslator() {
67+
Properties config = new Properties();
68+
try {
69+
config.load(YandexTranslator.class
70+
.getResourceAsStream(
71+
"translator.yandex.properties"));
72+
this.apiKey = config.getProperty("translator.api-key");
73+
this.format = config.getProperty("translator.text.format");
74+
} catch (Exception e) {
75+
e.printStackTrace();
76+
}
77+
}
78+
79+
@Override
80+
public String translate(String text, String sourceLanguage,
81+
String targetLanguage) throws TikaException, IOException {
82+
if (!this.isAvailable()) {
83+
return text;
84+
}
85+
86+
WebClient client = WebClient.create(YANDEX_TRANSLATE_URL_BASE);
87+
88+
String langCode;
89+
90+
if (sourceLanguage == null) {
91+
//Translate Service will identify source language
92+
langCode = targetLanguage;
93+
} else {
94+
//Source language is well known
95+
langCode = sourceLanguage + '-' + targetLanguage;
96+
}
97+
98+
//TODO Add support for text over 10k characters
99+
Response response = client.accept(MediaType.APPLICATION_JSON)
100+
.query("key", this.apiKey).query("lang", langCode)
101+
.query("text", text).get();
102+
BufferedReader reader = new BufferedReader(new InputStreamReader(
103+
(InputStream) response.getEntity(), UTF_8));
104+
String line = null;
105+
StringBuffer responseText = new StringBuffer();
106+
while ((line = reader.readLine()) != null) {
107+
responseText.append(line);
108+
}
109+
110+
try {
111+
ObjectMapper mapper = new ObjectMapper();
112+
JsonNode jsonResp = mapper.readTree(responseText.toString());
113+
114+
if (!jsonResp.findValuesAsText("code").isEmpty()) {
115+
String code = jsonResp.findValuesAsText("code").get(0);
116+
if (code.equals("200")) {
117+
return jsonResp.findValue("text").get(0).asText();
118+
} else {
119+
throw new TikaException(jsonResp.findValue("message").get(0).asText());
120+
}
121+
} else {
122+
throw new TikaException("Return message not recognized: " + responseText.toString().substring(0, Math.min(responseText.length(), 100)));
123+
}
124+
} catch (JsonParseException e) {
125+
throw new TikaException("Error requesting translation from '" + sourceLanguage + "' to '" + targetLanguage + "', JSON response from Lingo24 is not well formatted: " + responseText.toString());
126+
}
127+
}
128+
129+
130+
/**
131+
* Get the API Key in use for client authentication
132+
* @return API Key
133+
*/
134+
public String getApiKey() {
135+
return apiKey;
136+
}
137+
138+
/**
139+
* Set the API Key for client authentication
140+
* @param apiKey API Key
141+
*/
142+
public void setApiKey(String apiKey) {
143+
this.apiKey = apiKey;
144+
}
145+
146+
/**
147+
* Retrieve the current text format setting.
148+
* The Yandex Translate API can handle text in <b>plain</b> and/or <b>html</b> format, the default
149+
* format is <b>plain</b>
150+
* @return
151+
*/
152+
public String getFormat() {
153+
return format;
154+
}
155+
156+
/**
157+
* Set the text format to use (plain/html)
158+
* @param format Text format setting, either plain or html
159+
*/
160+
public void setFormat(String format) {
161+
this.format = format;
162+
}
163+
164+
@Override
165+
public String translate(String text, String targetLanguage)
166+
throws TikaException, IOException {
167+
return this.translate(text, null, targetLanguage);
168+
}
169+
170+
@Override
171+
public boolean isAvailable() {
172+
return this.apiKey!=null && !this.apiKey.equals(DEFAULT_KEY);
173+
}
174+
175+
}

0 commit comments

Comments
 (0)