-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTweetParser.java
194 lines (165 loc) · 6.3 KB
/
TweetParser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
/*
* Handles I/O with tweets expressed as JSON files.
*
* Copyright (C) 2013 Lisa Vitolo <[email protected]>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the Creative Commons
* Attribution-NonCommercial-ShareAlike 3.0 license.
* You should have received a copy of the license with this product.
* Otherwise, visit http://creativecommons.org/licenses/by-nc-sa/3.0/
*/
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.List;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import java.io.PrintStream;
import java.io.StringReader;
/*
 * Reads a tweet described as a JSON object (from a file or from stdin), builds one
 * SentencePair per entry of its "qpairs" array, and writes the same JSON object back
 * (to a file or to stdout) after a "similarity" score has been added to each entry.
 */
public class TweetParser
{
    /* Path of the input JSON file; null means "read the JSON object from stdin". */
    private final String inputFile;
    /* Path of the output JSON file; null means "write the result to stdout". */
    private final String outputFile;
    /* Root of the JSON document read by parse(); null until parse() has completed. */
    private Object inputJSON;
    /* Sentence pairs built by the last call to parse(). */
    private List<SentencePair> sentencePairs;
    /* Pipeline handed unchanged to every SentencePair that gets created. */
    private final StanfordCoreNLP nlp;

    /*
     * inputFile:  path of the JSON file to read, or null to read from stdin.
     * outputFile: path of the JSON file to write, or null to write to stdout.
     * nlp:        pipeline passed on to the SentencePair constructor.
     */
    public TweetParser(String inputFile, String outputFile, StanfordCoreNLP nlp)
    {
        this.inputFile = inputFile;
        this.outputFile = outputFile;
        this.nlp = nlp;
    }

    /*
     * Creates sentence pairs from the input tweet. Each sentence pair contains the main question
     * and one of the questions in the "qpairs" array.
     *
     * On an I/O error, on malformed JSON, or when the document does not have the expected
     * shape (top-level object with "tokens" and "qpairs" arrays), an error is printed on
     * stderr and the JVM exits with status -1.
     */
    public void parse()
    {
        JSONParser json = new JSONParser();
        this.sentencePairs = new LinkedList<>();

        try {
            Object obj;
            if (inputFile == null) {
                obj = json.parse(new StringReader(readJSONFromStdin()));
            } else {
                /* try-with-resources: the file handle is released even if parsing fails */
                try (FileReader reader = new FileReader(inputFile)) {
                    obj = json.parse(reader);
                }
            }

            if (!(obj instanceof JSONObject)) {
                System.err.println(":: Parse error: top-level JSON value is not an object");
                System.exit(-1);
            }
            JSONObject jsonObject = (JSONObject) obj;

            List<String> tweetPOSTags = new LinkedList<>();
            List<String> tweetNormalizedTokens = new LinkedList<>();
            List<String> tweetLemmas = new LinkedList<>();

            /*
             * For the user question, gets all the needed information at this stage,
             * without having to use external tools. Normalized tokens are used to reduce the
             * negative effects of misspelled words.
             */
            for (Object t : requireArray(jsonObject, "tokens")) {
                JSONObject token = (JSONObject) t;
                tweetNormalizedTokens.add((String) token.get("normalized"));
                tweetPOSTags.add((String) token.get("pos"));
                tweetLemmas.add((String) token.get("lemma"));
            }

            /* The same three token lists are shared by every pair; only the question differs. */
            for (Object p : requireArray(jsonObject, "qpairs")) {
                JSONObject qaPair = (JSONObject) p;
                String question = (String) qaPair.get("question");
                sentencePairs.add(new SentencePair(tweetNormalizedTokens, tweetPOSTags, tweetLemmas, question, nlp));
            }

            /* Kept so that writeSimilarities() can annotate and re-emit the same document. */
            this.inputJSON = obj;
        } catch (IOException e) {
            System.err.println(":: IO Error: " + e);
            System.exit(-1);
        } catch (ParseException e) { /* malformed JSON file */
            System.err.println(":: Parse error: " + e);
            System.exit(-1);
        }
    }

    /*
     * Writes similarity scores in the output JSON file. We add a new field "similarity" in each
     * "qpairs" object with the score; similarities[i] goes to the i-th element of "qpairs".
     *
     * Throws IllegalStateException if parse() has not been run successfully before, and
     * IllegalArgumentException if there are fewer scores than "qpairs" entries (checked
     * before any entry is modified). A failure to open the output file is reported on stderr.
     */
    @SuppressWarnings("unchecked") /* JSONObject.put is inherited from a raw Map */
    public void writeSimilarities(double[] similarities)
    {
        if (this.inputJSON == null) {
            throw new IllegalStateException("parse() must be called before writeSimilarities()");
        }

        JSONObject jsonObject = (JSONObject) this.inputJSON;
        JSONArray questions = (JSONArray) jsonObject.get("qpairs");
        if (similarities.length < questions.size()) {
            throw new IllegalArgumentException("expected " + questions.size()
                    + " similarity scores but got " + similarities.length);
        }

        int index = 0;
        for (Object p : questions) {
            JSONObject qaPair = (JSONObject) p;
            qaPair.put("similarity", similarities[index++]);
        }

        /*
         * By default, the JSON string is compressed in one line, so I add some newlines for
         * readability. The resulting string is still a valid JSON object.
         */
        String formatted = goodJSONFormat(jsonObject.toJSONString());

        try {
            if (outputFile != null) {
                try (PrintStream writer = new PrintStream(outputFile)) {
                    writer.println(formatted);
                }
            } else {
                /* stdout is only flushed, never closed: the rest of the program may still use it */
                System.out.println(formatted);
                System.out.flush();
            }
        } catch (IOException e) {
            System.err.println(":: Error writing JSON output: " + e.getLocalizedMessage());
        }
    }

    /*
     * Returns the sentence pairs built by the last call to parse(), or null if parse()
     * has never been called. The returned list is the internal one, not a copy.
     */
    public List<SentencePair> getSentencePairs()
    {
        return sentencePairs;
    }

    /*
     * Returns the given "qpairs"/"tokens" style array field of a JSON object. If the field
     * is missing or is not an array, reports it on stderr and exits with status -1, like the
     * other input errors handled by parse().
     */
    private static JSONArray requireArray(JSONObject object, String key)
    {
        Object value = object.get(key);
        if (!(value instanceof JSONArray)) {
            System.err.println(":: Parse error: missing or invalid array field \"" + key + "\"");
            System.exit(-1);
        }
        return (JSONArray) value;
    }

    /*
     * Inserts a newline after every comma that separates JSON members or elements.
     * Commas inside string literals are left alone: a raw newline inside a JSON string
     * would make the document invalid.
     */
    private String goodJSONFormat(String jsonOutput)
    {
        StringBuilder formatted = new StringBuilder(jsonOutput.length() + 16);
        boolean inString = false;
        boolean escaped = false;

        for (int i = 0; i < jsonOutput.length(); i++) {
            char ch = jsonOutput.charAt(i);
            formatted.append(ch);

            if (inString) {
                if (escaped) {
                    escaped = false;        /* character after a backslash is taken literally */
                } else if (ch == '\\') {
                    escaped = true;
                } else if (ch == '"') {
                    inString = false;
                }
            } else if (ch == '"') {
                inString = true;
            } else if (ch == ',') {
                formatted.append('\n');
            }
        }

        return formatted.toString();
    }

    /*
     * When no input file is supplied, reads a JSON string from stdin. Reading stops at the
     * end of the line on which the outermost brace "{" is closed, or at end of input.
     * Braces are counted one by one and those inside string literals are ignored.
     * Every line read is kept, followed by "\n"; one extra "\n" is appended once the
     * object is complete.
     */
    private static String readJSONFromStdin()
    {
        StringBuilder content = new StringBuilder();

        try {
            /* Deliberately not closed: closing the reader would close System.in as well. */
            BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
            int depth = 0;
            boolean complete = false;
            String line;

            /* readLine() returns null at end of input; stop instead of dereferencing it */
            while (!complete && (line = reader.readLine()) != null) {
                content.append(line).append('\n');

                /* A valid JSON string cannot span lines, so string state restarts per line. */
                boolean inString = false;
                boolean escaped = false;
                for (int i = 0; i < line.length(); i++) {
                    char ch = line.charAt(i);
                    if (inString) {
                        if (escaped) {
                            escaped = false;
                        } else if (ch == '\\') {
                            escaped = true;
                        } else if (ch == '"') {
                            inString = false;
                        }
                    } else if (ch == '"') {
                        inString = true;
                    } else if (ch == '{') {
                        depth++;
                    } else if (ch == '}') {
                        depth--;
                        /* <= 0 also stops on a stray "}", leaving the error to the parser */
                        if (depth <= 0) {
                            complete = true;
                        }
                    }
                }
            }

            if (complete) {
                content.append('\n');
            }
        } catch (IOException ex) {
            System.err.println(":: Error reading from stdin: " + ex.getMessage());
            System.exit(-1);
        }

        return content.toString();
    }
}