Skip to content

Commit faac228

Browse files
authored
SDAP-71 full ingestion workflow and process broken in several places (#16)
1 parent c2dce18 commit faac228

21 files changed

+407
-318
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@ core/.externalToolBuilders/Maven_Ant_Builder.launch
1515
core/maven-eclipse.xml
1616
service/.classpath
1717
web/.classpath
18+
web/.externalToolBuilders/
19+
web/maven-eclipse.xml

core/src/main/java/org/apache/sdap/mudrod/driver/ESDriver.java

+21-5
Original file line numberDiff line numberDiff line change
@@ -177,15 +177,31 @@ public List<String> customAnalyzing(String indexName, List<String> list) throws
177177
}
178178

179179
public void deleteAllByQuery(String index, String type, QueryBuilder query) {
180-
ImmutableOpenMap<String, MappingMetaData> mappings = getClient().admin().cluster().prepareState().execute().actionGet()
181-
.getState().metaData().index(index).getMappings();
180+
ImmutableOpenMap<String, MappingMetaData> mappings = getClient()
181+
.admin()
182+
.cluster()
183+
.prepareState()
184+
.execute()
185+
.actionGet()
186+
.getState()
187+
.metaData()
188+
.index(index)
189+
.getMappings();
182190

183191
//check if the type exists
184-
if (!mappings.containsKey(type)) return;
192+
if (!mappings.containsKey(type))
193+
return;
185194

186195
createBulkProcessor();
187-
SearchResponse scrollResp = getClient().prepareSearch(index).setSearchType(SearchType.QUERY_AND_FETCH).setTypes(type).setScroll(new TimeValue(60000)).setQuery(query).setSize(10000).execute()
188-
.actionGet();
196+
SearchResponse scrollResp = getClient()
197+
.prepareSearch(index)
198+
.setSearchType(SearchType.QUERY_AND_FETCH)
199+
.setTypes(type)
200+
.setScroll(new TimeValue(60000))
201+
.setQuery(query)
202+
.setSize(10000)
203+
.execute()
204+
.actionGet();
189205

190206
while (true) {
191207
for (SearchHit hit : scrollResp.getHits().getHits()) {

core/src/main/java/org/apache/sdap/mudrod/integration/LinkageIntegration.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ public JsonObject getIngeratedListInJson(String input) {
171171
* the similarities from different sources
172172
*/
173173
public Map<String, List<LinkedTerm>> aggregateRelatedTermsFromAllmodel(String input) {
174-
aggregateRelatedTerms(input, MudrodConstants.USE_HISTORY_LINKAGE_TYPE);
174+
aggregateRelatedTerms(input, MudrodConstants.USER_HISTORY_LINKAGE_TYPE);
175175
aggregateRelatedTerms(input, MudrodConstants.CLICK_STREAM_LINKAGE_TYPE);
176176
aggregateRelatedTerms(input, MudrodConstants.METADATA_LINKAGE_TYPE);
177177
aggregateRelatedTermsSWEET(input, MudrodConstants.ONTOLOGY_LINKAGE_TYPE);
@@ -180,7 +180,7 @@ public Map<String, List<LinkedTerm>> aggregateRelatedTermsFromAllmodel(String in
180180
}
181181

182182
public int getModelweight(String model) {
183-
if (model.equals(MudrodConstants.USE_HISTORY_LINKAGE_TYPE)) {
183+
if (model.equals(MudrodConstants.USER_HISTORY_LINKAGE_TYPE)) {
184184
return Integer.parseInt(props.getProperty(MudrodConstants.USER_HISTORY_W));
185185
}
186186

core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java

+33-37
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,22 @@
1515

1616
/**
1717
* Class contains static constant keys and values relating to Mudrod
18-
* configuration properties. Property values are read from <a href=
19-
* "https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml">config.xml</a>
18+
* configuration properties. Property values are read from <code>config.properties</code>.
2019
*/
2120
public interface MudrodConstants {
2221

2322
public static final String CLEANUP_TYPE = "cleanup.log";
2423

25-
public static final String CLICK_STREAM_LINKAGE_TYPE = "click.stream.linkage";
24+
public static final String CLICK_STREAM_LINKAGE_TYPE = "mudrod.clickstream.linkage";
2625

27-
public static final String CLICK_STREAM_MATRIX_TYPE = "click.stream.matrix";
26+
public static final String CLICK_STREAM_MATRIX_TYPE = "mudrod.clickstream.matrix";
2827

2928
public static final String CLICKSTREAM_SVD_DIM = "mudrod.clickstream.svd.d";
3029

3130
public static final String CLICKSTREAM_W = "mudrod.clickstream.weight";
32-
31+
3332
public static final String CLICKSTREAM_PATH = "mudrod.clickstream.path";
34-
33+
3534
public static final String CLICKSTREAM_SVD_PATH = "mudrod.clickstream.svd.path";
3635

3736
/** Defined on CLI */
@@ -52,25 +51,25 @@ public interface MudrodConstants {
5251
public static final String FTP_PREFIX = "mudrod.ftp.prefix";
5352

5453
public static final String FTP_TYPE = "raw.ftp";
55-
54+
5655
public static final String FTP_LOG = "ftp";
5756

5857
public static final String HTTP_PREFIX = "mudrod.http.prefix";
5958

6059
public static final String HTTP_TYPE = "raw.http";
61-
60+
6261
public static final String HTTP_LOG = "http";
63-
62+
6463
public static final String BASE_URL = "mudrod.base.url";
65-
64+
6665
public static final String BLACK_LIST_REQUEST = "mudrod.black.request.list";
67-
66+
6867
public static final String BLACK_LIST_AGENT = "mudrod.black.agent.list";
6968

7069
public static final String LOG_INDEX = "mudrod.log.index";
7170

7271
public static final String METADATA_LINKAGE_TYPE = "metadata.linkage";
73-
72+
7473
public static final String METADATA_DOWNLOAD_URL = "mudrod.metadata.download.url";
7574

7675
public static final String METADATA_SVD_DIM = "mudrod.metadata.svd.d";
@@ -93,38 +92,38 @@ public interface MudrodConstants {
9392
public static final String ONTOLOGY_LINKAGE_TYPE = "ontology.linkage";
9493

9594
public static final String ONTOLOGY_W = "mudrod.ontology.weight";
96-
95+
9796
public static final String ONTOLOGY_PATH = "mudrod.ontology.path";
98-
97+
9998
public static final String ONTOLOGY_INPUT_PATH = "mudrod.ontology.input.path";
10099

101100
/** Defined on CLI */
102101
public static final String METADATA_DOWNLOAD = "mudrod.metadata.download";
103-
102+
104103
public static final String RAW_METADATA_PATH = "mudrod.metadata.path";
105104

106105
public static final String RAW_METADATA_TYPE = "mudrod.metadata.type";
107-
106+
108107
public static final String METADATA_MATRIX_PATH = "mudrod.metadata.matrix.path";
109-
108+
110109
public static final String METADATA_SVD_PATH = "mudrod.metadata.svd.path";
111-
110+
112111
public static final String RECOM_METADATA_TYPE = "recommedation.metadata";
113-
112+
114113
public static final String METADATA_ID = "mudrod.metadata.id";
115-
114+
116115
public static final String SEMANTIC_FIELDS = "mudrod.metadata.semantic.fields";
117-
116+
118117
public static final String METADATA_WORD_SIM_TYPE = "metadata.word.sim";
119-
118+
120119
public static final String METADATA_FEATURE_SIM_TYPE = "metadata.feature.sim";
121-
120+
122121
public static final String METADATA_SESSION_SIM_TYPE = "metadata.session.sim";
123-
122+
124123
public static final String METADATA_TERM_MATRIX_PATH = "metadata.term.matrix.path";
125-
124+
126125
public static final String METADATA_WORD_MATRIX_PATH = "metadata.word.matrix.path";
127-
126+
128127
public static final String METADATA_SESSION_MATRIX_PATH = "metadata.session.matrix.path";
129128

130129
public static final String REQUEST_RATE = "mudrod.request.rate";
@@ -138,32 +137,29 @@ public interface MudrodConstants {
138137
public static final String SPARK_APP_NAME = "mudrod.spark.app.name";
139138

140139
public static final String SPARK_MASTER = "mudrod.spark.master";
141-
/**
142-
* Absolute local location of javaSVMWithSGDModel directory. This is typically
143-
* <code>file:///usr/local/mudrod/core/src/main/resources/javaSVMWithSGDModel</code>
144-
*/
140+
145141
public static final String RANKING_MODEL = "mudrod.ranking.model";
146-
142+
147143
public static final String RANKING_ML = "mudrod.ranking.machine.learning";
148144

149145
public static final String REQUEST_TIME_GAP = "mudrod.request.time.gap";
150146

151147
public static final String TIME_SUFFIX = "time.suffix";
152148

153-
public static final String USE_HISTORY_LINKAGE_TYPE = "user.history.linkage";
149+
public static final String USER_HISTORY_LINKAGE_TYPE = "mudrod.user.history.linkage";
154150

155151
public static final String USER_HISTORY_W = "mudrod.user.history.weight";
156-
152+
157153
public static final String USER_HISTORY_PATH = "mudrod.user.history.path";
158154

159155
public static final String VIEW_F = "mudrod.view.freq";
160-
156+
161157
public static final String VIEW_MARKER = "mudrod.view.url.marker";
162-
158+
163159
public static final String SEARCH_MARKER = "mudrod.search.url.marker";
164-
160+
165161
public static final String SEARCH_F = "mudrod.search.freq";
166-
162+
167163
public static final String DOWNLOAD_F = "mudrod.download.freq";
168164

169165
}

core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java

+8-8
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
import java.util.Properties;
3333

3434
/**
35-
* ClassName: ApiHarvester Function: Harvest metadata from PO.DAACweb service.
35+
* Harvest metadata from the PO.DAAC web service.
3636
*/
3737
public class ApiHarvester extends DiscoveryStepAbstract {
3838

@@ -70,7 +70,7 @@ public Object execute() {
7070
}
7171

7272
/**
73-
* addMetadataMapping: Add mapping to index metadata in Elasticsearch. Please
73+
* Add mapping to index metadata in Elasticsearch. Please
7474
* invoke this method before importing metadata into Elasticsearch.
7575
*/
7676
public void addMetadataMapping() {
@@ -84,7 +84,7 @@ public void addMetadataMapping() {
8484
}
8585

8686
/**
87-
* importToES: Index metadata into elasticsearch from local file directory.
87+
* Index metadata into elasticsearch from local file directory.
8888
* Please make sure metadata has been harvested from the web service before
8989
* invoking this method.
9090
*/
@@ -118,12 +118,12 @@ private void importSingleFileToES(InputStream is) {
118118
}
119119

120120
/**
121-
* harvestMetadatafromWeb: Harvest metadata from PO.DAAC web service.
121+
* Harvest metadata from PO.DAAC web service.
122122
*/
123123
private void harvestMetadatafromWeb() {
124124
LOG.info("Metadata download started.");
125125
int startIndex = 0;
126-
int doc_length = 0;
126+
int docLength = 0;
127127
JsonParser parser = new JsonParser();
128128
do {
129129
String searchAPI = props.getProperty(MudrodConstants.METADATA_DOWNLOAD_URL);
@@ -135,7 +135,7 @@ private void harvestMetadatafromWeb() {
135135
JsonObject responseObject = json.getAsJsonObject();
136136
JsonArray docs = responseObject.getAsJsonObject("response").getAsJsonArray("docs");
137137

138-
doc_length = docs.size();
138+
docLength = docs.size();
139139

140140
File file = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH));
141141
if (!file.exists()) {
@@ -145,7 +145,7 @@ private void harvestMetadatafromWeb() {
145145
LOG.error("Failed to create directory!");
146146
}
147147
}
148-
for (int i = 0; i < doc_length; i++) {
148+
for (int i = 0; i < docLength; i++) {
149149
JsonElement item = docs.get(i);
150150
int docId = startIndex + i;
151151
File itemfile = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH) + "/" + docId + ".json");
@@ -167,7 +167,7 @@ private void harvestMetadatafromWeb() {
167167
Thread.currentThread().interrupt();
168168
}
169169

170-
} while (doc_length != 0);
170+
} while (docLength != 0);
171171

172172
LOG.info("Metadata downloading finished");
173173
}

core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,12 @@ public Object execute() {
6464
String metadataMatrixFile = props.getProperty(MudrodConstants.METADATA_MATRIX_PATH);
6565
try {
6666
MetadataExtractor extractor = new MetadataExtractor();
67-
JavaPairRDD<String, List<String>> metadataTermsRDD = extractor.loadMetadata(this.es, this.spark.sc, props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty(MudrodConstants.RAW_METADATA_TYPE));
67+
JavaPairRDD<String, List<String>> metadataTermsRDD =
68+
extractor.loadMetadata(
69+
this.es,
70+
this.spark.sc,
71+
props.getProperty(MudrodConstants.ES_INDEX_NAME),
72+
props.getProperty(MudrodConstants.RAW_METADATA_TYPE));
6873
LabeledRowMatrix wordDocMatrix = MatrixUtil.createWordDocMatrix(metadataTermsRDD);
6974
MatrixUtil.exportToCSV(wordDocMatrix.rowMatrix, wordDocMatrix.rowkeys, wordDocMatrix.colkeys, metadataMatrixFile);
7075

core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java

+1-2
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@
2727
import java.util.Properties;
2828

2929
/**
30-
* ClassName: MetadataAnalyzer
31-
* Function: Calculate semantic relationship of vocabularies extracted from
30+
* Calculate semantic relationship of vocabularies extracted from
3231
* metadata.
3332
*/
3433
public class MetadataAnalyzer extends DiscoveryStepAbstract implements Serializable {

core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/MetadataTFIDFGenerator.java

+10-2
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,11 @@ public LabeledRowMatrix generateWordBasedTFIDF() throws Exception {
8181

8282
LabeledRowMatrix wordtfidfMatrix = opt.tFIDFTokens(metadataWords, spark);
8383

84-
MatrixUtil.exportToCSV(wordtfidfMatrix.rowMatrix, wordtfidfMatrix.rowkeys, wordtfidfMatrix.colkeys, props.getProperty(MudrodConstants.METADATA_WORD_MATRIX_PATH));
84+
MatrixUtil.exportToCSV(
85+
wordtfidfMatrix.rowMatrix,
86+
wordtfidfMatrix.rowkeys,
87+
wordtfidfMatrix.colkeys,
88+
props.getProperty(MudrodConstants.METADATA_WORD_MATRIX_PATH));
8589

8690
return wordtfidfMatrix;
8791
}
@@ -100,7 +104,11 @@ public LabeledRowMatrix generateTermBasedTFIDF() throws Exception {
100104

101105
LabeledRowMatrix tokentfidfMatrix = opt.tFIDFTokens(metadataTokens, spark);
102106

103-
MatrixUtil.exportToCSV(tokentfidfMatrix.rowMatrix, tokentfidfMatrix.rowkeys, tokentfidfMatrix.colkeys, props.getProperty(MudrodConstants.METADATA_TERM_MATRIX_PATH));
107+
MatrixUtil.exportToCSV(
108+
tokentfidfMatrix.rowMatrix,
109+
tokentfidfMatrix.rowkeys,
110+
tokentfidfMatrix.colkeys,
111+
props.getProperty(MudrodConstants.METADATA_TERM_MATRIX_PATH));
104112

105113
return tokentfidfMatrix;
106114
}

0 commit comments

Comments
 (0)