Skip to content

Commit 7a0ae01

Browse files
committed
first commit
1 parent 0b75724 commit 7a0ae01

File tree

14 files changed

+36
-126
lines changed

14 files changed

+36
-126
lines changed

.idea/workspace.xml

+9-101
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/main/scala/com/.DS_Store

0 Bytes
Binary file not shown.

src/main/scala/com/sparkProject/Trainer.scala

+12-11
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ object Trainer {
3131
.appName("TP_spark")
3232
.getOrCreate()
3333

34-
// only for self made function
34+
// only for self made functions
3535
// import spark.implicits._
3636

3737
/*******************************************************************************
@@ -45,6 +45,8 @@ object Trainer {
4545
*
4646
* if modules fail to import => update the sbt plugins
4747
*
48+
* ALEXANDRE ROUXEL
49+
*
4850
********************************************************************************/
4951

5052
/** CHARGER LE DATASET **/
@@ -62,7 +64,6 @@ object Trainer {
6264
.setInputCol("text")
6365
.setOutputCol("tokens")
6466

65-
//StopWordsRemover
6667
// stage 2
6768
val remover = new StopWordsRemover()
6869
.setInputCol("tokens")
@@ -86,12 +87,12 @@ object Trainer {
8687
// stage 5
8788

8889
// string indexer
89-
val indexer = new StringIndexer()
90+
val indexerCountry = new StringIndexer()
9091
.setInputCol("country2")
9192
.setOutputCol("country_indexed")
9293

9394
// stage 6
94-
val indexer2 = new StringIndexer()
95+
val indexerCurrency = new StringIndexer()
9596
.setInputCol("currency2")
9697
.setOutputCol("currency_indexed")
9798

@@ -119,11 +120,11 @@ object Trainer {
119120
/** PIPELINE **/
120121

121122
val pipeline = new Pipeline()
122-
.setStages(Array ( tokenizer , remover , countvectorizer , idf , indexer , indexer2 , vecAssembler ,lr))
123+
.setStages(Array ( tokenizer , remover , countvectorizer , idf , indexerCountry , indexerCurrency , vecAssembler ,lr))
123124

124125
/** TRAINING AND GRID-SEARCH **/
125126

126-
/** build a training set **/
127+
/** split the data into a training set (90 %) and a test set (10 %) **/
127128
val Array(training, test) = df.randomSplit(Array(0.9, 0.1), seed = 12345)
128129

129130

@@ -143,17 +144,17 @@ object Trainer {
143144
.setPredictionCol("predictions")
144145
.setMetricName("f1")
145146

146-
147+
/** define the train/validation split (a single split, not k-fold cross validation), scored with the F1 measure, using 70% of the data for training and the rest for validation */
147148
val cv = new TrainValidationSplit()
148149
.setEstimator(pipeline)
149150
.setEvaluator(evaluatorF1)
150151
.setEstimatorParamMaps(paramGrid)
151152
.setTrainRatio(0.7)
152153

153-
/** run the cross validator on the training set **/
154+
/** fit the train/validation split on the training set; the result holds the best model found over the parameter grid **/
154155
val cvModel = cv.fit(training)
155156

156-
/** run the cross validator on test and training set **/
157+
/** apply the best model found by the train/validation split to both the training set and the test set **/
157158
val trainPredictions = cvModel.transform(training)
158159
val testPredictions = cvModel.transform(test)
159160

@@ -168,9 +169,9 @@ object Trainer {
168169

169170
df_WithPredictions.groupBy( "final_status" , "predictions" ).count.show()
170171

171-
println("F1 measurement on training set ")
172+
println("F1 measurement on training set : ")
172173
println(f1Train)
173-
println("F1 measurement on test set ")
174+
println("F1 measurement on test set : ")
174175
println(f1Test)
175176

176177
/** save the trained model **/
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

target/streams/$global/assembly/$global/streams/out

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[debug] Calculate mappings...
22
[info] Checking every *.class/*.jar file's SHA-1.
33
[info] Merging files...
4-
[info] SHA-1: ce0b476c860eaf6c622567df87af3ea87eefec4e
4+
[info] SHA-1: 4959e7de8eb5805b66f88853e69632f7c461c412
55
[info] Packaging /Users/alexandre/MSBGD/spark/tp/TP_ParisTech_2017_2018_starter/target/scala-2.11/TP_ParisTech_2017_2018-assembly-1.0.jar ...
66
[debug] Input file mappings:
77
[debug]  com/sparkProject/Exercice$.class

target/streams/compile/compileIncremental/$global/streams/out

+2-2
Large diffs are not rendered by default.

target/streams/compile/incCompileSetup/$global/streams/inc_compile_2.11

+12-11
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)