forked from mit-nlp/MITIE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_categorizer_pure_model.py
executable file
·50 lines (39 loc) · 2.13 KB
/
text_categorizer_pure_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/python
#
# This example shows how to use the MITIE Python API to train a
# text_categorizer that produces smaller model files. However, these
# smaller files are dependent on some feature extractor file which must be
# provided when the model is loaded, as you will see in this example.
#
# For a comparison of what a non-pure version of the API looks like you can
# read the categorize_text.py and train_text_categorizer.py examples.
#
#
import os
import sys

# Make sure the mitielib folder is on the python search path.  There are a lot
# of ways to do this; here we do it programmatically, relative to the location
# of this script rather than the current working directory.
parent = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(parent, '..', '..', 'mitielib'))

from mitie import *

# Locate the feature extractor relative to this script as well, so the example
# resolves both mitielib and the model file the same way no matter which
# directory it is launched from.
fe_filename = os.path.normpath(os.path.join(
    parent, "..", "..", "MITIE-models", "english",
    "total_word_feature_extractor.dat"))

trainer = text_categorizer_trainer(fe_filename)

# Don't forget to add the training data.  Here we have only two examples, but
# for real uses you need to have thousands.  You could also pass whole
# sentences to the tokenize() function to get the tokens.
trainer.add_labeled_text(
    ["I", "am", "so", "happy", "and", "exciting", "to", "make", "this"],
    "positive")
trainer.add_labeled_text(
    ["What", "a", "black", "and", "bad", "day"],
    "negative")

# The trainer can take advantage of a multi-core CPU.  So set the number of
# threads equal to the number of processing cores for maximum training speed.
trainer.num_threads = 4

# This function does the work of training.  Note that it can take a long time
# to run when using larger training datasets.  So be patient.
cat = trainer.train()

# Now that training is done we can save the categorizer object to disk like so.
# In pure_model mode we do not include a copy of the feature extractor, which
# is why the same feature extractor file has to be supplied again when loading.
cat.save_to_disk("new_text_categorizer_pure_model.dat", pure_model=True)

# Now to load the 'pure model' from disk we also pass the feature extractor
# filename.
cat2 = text_categorizer("new_text_categorizer_pure_model.dat", fe_filename)

text1 = "I am so happy"
pred, conf = cat2(tokenize(text1))
print("predict sentiment of text '{0}' to be {1} with confidence {2}".format(text1, pred, conf))