
Commit 79dc8db

Add SentencePiece as a Git submodule (#167)
1 parent: 757f04f

File tree

11 files changed: +24, -130 lines

.gitmodules (+3)

@@ -4,3 +4,6 @@
 [submodule "third_party/cxxopts"]
 	path = third_party/cxxopts
 	url = https://github.com/jarro2783/cxxopts.git
+[submodule "third_party/sentencepiece"]
+	path = third_party/sentencepiece
+	url = https://github.com/google/sentencepiece.git
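
Because SentencePiece is now tracked as a submodule, a plain `git clone` no longer fetches its sources: building from source requires `git clone --recursive`, or `git submodule update --init` in an existing checkout, before configuring with CMake.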

.travis.yml (+2, -23)

@@ -3,33 +3,12 @@ language: cpp
 compiler:
   - gcc
   - clang
-env:
-  global:
-    - SENTENCEPIECE_VERSION="0.1.8"
-cache:
-  directories:
-    - $HOME/sentencepiece-$SENTENCEPIECE_VERSION/
-before_install:
-  - export ROOT_TRAVIS_DIR=$(pwd)
-  - |
-    if [ ! -d $HOME/sentencepiece-$SENTENCEPIECE_VERSION/lib ]; then
-      wget https://github.com/google/sentencepiece/archive/v$SENTENCEPIECE_VERSION.tar.gz
-      tar xf v$SENTENCEPIECE_VERSION.tar.gz
-      cd sentencepiece-$SENTENCEPIECE_VERSION
-      mkdir build
-      cd build
-      cmake -DCMAKE_INSTALL_PREFIX=$HOME/sentencepiece-$SENTENCEPIECE_VERSION ..
-      make
-      make install
-      cd $ROOT_TRAVIS_DIR
-    fi
 install:
   - export TOKENIZER_ROOT=$HOME/Tokenizer
-  - export SENTENCEPIECE_ROOT=$HOME/sentencepiece-$SENTENCEPIECE_VERSION
   - mkdir build && cd build
-  - cmake -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$TOKENIZER_ROOT -DCMAKE_PREFIX_PATH=$SENTENCEPIECE_ROOT ..
+  - cmake -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$TOKENIZER_ROOT ..
   - make install
-  - cd $ROOT_TRAVIS_DIR
+  - cd ..
 script:
   - build/test/onmt_tokenizer_test test/data
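
The CI recipe shrinks accordingly: SentencePiece is now compiled as part of the main CMake build, so the cached external install and the `-DCMAKE_PREFIX_PATH` hint are no longer needed.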

CMakeLists.txt (+14, -38)

@@ -17,6 +17,10 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif(NOT CMAKE_BUILD_TYPE)
 
+option(SPM_ENABLE_SHARED "" OFF)
+option(SPM_ENABLE_TCMALLOC "" OFF)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third_party/sentencepiece EXCLUDE_FROM_ALL)
+
 if(MSVC)
   if(NOT BUILD_SHARED_LIBS)
     if(CMAKE_VERSION VERSION_LESS "3.15.0")
@@ -32,16 +36,22 @@ endif()
 find_package(ICU REQUIRED)
 
 set(INCLUDE_DIRECTORIES
+  PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}/include
   ${PROJECT_BINARY_DIR}
+
+  PRIVATE
   ${ICU_INCLUDE_DIRS}
+  ${CMAKE_CURRENT_SOURCE_DIR}/third_party/sentencepiece/src
   )
 
 set(PUBLIC_HEADERS
   include/onmt/Token.h
   include/onmt/BPE.h
   include/onmt/BPELearner.h
   include/onmt/ITokenizer.h
+  include/onmt/SPMLearner.h
+  include/onmt/SentencePiece.h
   include/onmt/SpaceTokenizer.h
   include/onmt/SubwordEncoder.h
   include/onmt/SubwordLearner.h
@@ -53,6 +63,8 @@ set(SOURCES
   src/BPELearner.cc
   src/Casing.cc
   src/ITokenizer.cc
+  src/SPMLearner.cc
+  src/SentencePiece.cc
   src/SpaceTokenizer.cc
   src/SubwordEncoder.cc
   src/SubwordLearner.cc
@@ -64,46 +76,10 @@ set(SOURCES
 
 list(APPEND LINK_LIBRARIES
   ${ICU_LIBRARIES}
+  sentencepiece-static
+  sentencepiece_train-static
   )
 
-find_library(SP_LIBRARY NAMES sentencepiece)
-find_path(SP_INCLUDE_DIR NAMES sentencepiece_processor.h)
-
-if(NOT SP_LIBRARY OR NOT SP_INCLUDE_DIR)
-  message(WARNING "sentencepiece not found; will not be supported")
-else()
-  message(STATUS "Found sentencepiece: ${SP_LIBRARY}")
-  add_definitions(-DWITH_SP)
-  list(APPEND PUBLIC_HEADERS include/onmt/SentencePiece.h)
-  list(APPEND SOURCES src/SentencePiece.cc)
-  list(APPEND INCLUDE_DIRECTORIES ${SP_INCLUDE_DIR})
-  list(APPEND LINK_LIBRARIES ${SP_LIBRARY})
-
-  find_library(SP_TRAIN_LIBRARY NAMES sentencepiece_train)
-  if(SP_TRAIN_LIBRARY)
-    find_package(Threads)
-    add_definitions(-DWITH_SP_TRAIN)
-    message(STATUS "Found sentencepiece_train: ${SP_TRAIN_LIBRARY}")
-    list(APPEND PUBLIC_HEADERS include/onmt/SPMLearner.h)
-    list(APPEND SOURCES src/SPMLearner.cc)
-    list(APPEND LINK_LIBRARIES
-      ${SP_TRAIN_LIBRARY}
-      ${CMAKE_THREAD_LIBS_INIT})
-  else()
-    message(WARNING "sentencepiece_train not found: training SentencePiece models will not be supported")
-  endif()
-
-  file(STRINGS ${SP_INCLUDE_DIR}/sentencepiece_processor.h HAS_SAMPLE_ENCODE REGEX "SampleEncode")
-  if(HAS_SAMPLE_ENCODE)
-    add_definitions(-DSP_HAS_SAMPLE_ENCODE)
-  endif()
-
-  file(STRINGS ${SP_INCLUDE_DIR}/sentencepiece_processor.h HAS_VOCAB_RESTRICTION REGEX "SetVocabulary")
-  if(HAS_VOCAB_RESTRICTION)
-    add_definitions(-DSP_HAS_VOCAB_RESTRICTION)
-  endif()
-endif()
-
 add_library(${PROJECT_NAME} ${SOURCES})
 include(GNUInstallDirs)
 include(GenerateExportHeader)
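
Two details of this hunk are worth noting. Setting `SPM_ENABLE_SHARED` and `SPM_ENABLE_TCMALLOC` to OFF before `add_subdirectory` builds SentencePiece as static libraries without the tcmalloc dependency, and `EXCLUDE_FROM_ALL` keeps the submodule's own targets out of the default build; only `sentencepiece-static` and `sentencepiece_train-static` are compiled, because the tokenizer links against them. And since the submodule pins a known SentencePiece revision (cf98a47), the whole deleted `find_library`/`file(STRINGS ...)` block, which probed an external install for `SampleEncode` and `SetVocabulary` and set the `WITH_SP`/`WITH_SP_TRAIN` definitions, is no longer needed.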

README.md (-1)

@@ -70,7 +70,6 @@ See the `-h` flag to list the available options.
 ### Dependencies
 
 * [ICU](http://site.icu-project.org/)
-* (optional) [SentencePiece](https://github.com/google/sentencepiece)
 
 ### Compiling
 
bindings/python/tools/build_wheel.sh (-11)

@@ -6,7 +6,6 @@ set -e
 set -x
 
 ROOT_DIR=$PWD
-SENTENCEPIECE_VERSION=${SENTENCEPIECE_VERSION:-0.1.8}
 PYBIND11_VERSION=${PYBIND11_VERSION:-2.4.3}
 ICU_VERSION=${ICU_VERSION:-64.2}
 PATH=/opt/python/cp37-cp37m/bin:$PATH
@@ -22,16 +21,6 @@ cd $ROOT_DIR
 # Install cmake.
 pip install "cmake==3.13.*"
 
-# Build SentencePiece.
-curl -L -o sentencepiece-${SENTENCEPIECE_VERSION}.tar.gz -O https://github.com/google/sentencepiece/archive/v${SENTENCEPIECE_VERSION}.tar.gz
-tar zxf sentencepiece-${SENTENCEPIECE_VERSION}.tar.gz
-cd sentencepiece-${SENTENCEPIECE_VERSION}
-mkdir build
-cd build
-cmake ..
-make -j2 install
-cd $ROOT_DIR
-
 # Build Tokenizer.
 mkdir build
 cd build
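
The manylinux wheel script drops the same external build step for the same reason: the submodule sources are compiled into the extension through the main CMake project.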

cli/learn.cc (+1, -7)

@@ -6,9 +6,7 @@
 
 #include <onmt/Tokenizer.h>
 #include <onmt/BPELearner.h>
-#ifdef WITH_SP_TRAIN
-#  include <onmt/SPMLearner.h>
-#endif
+#include <onmt/SPMLearner.h>
 
 #include "tokenization_args.h"
 
@@ -95,15 +93,11 @@ int main(int argc, char* argv[])
 
   }
   else if (subword == "sentencepiece") {
-#ifdef WITH_SP_TRAIN
    learner = new onmt::SPMLearner(vm["verbose"].as<bool>(),
                                   std::vector<std::string>(subword_args.begin() + 1,
                                                            subword_args.end()),
                                   vm["tmpfile"].as<std::string>());
-#else
-    std::cerr << "ERROR: this Tokenizer was not built with SentencePiece training support" << std::endl;
    return 1;
-#endif
  }
  else {
    std::cerr << "ERROR: invalid subword type: " << subword << " (accepted: bpe, sentencepiece)" << std::endl;

cli/tokenize.cc (+1, -7)

@@ -4,9 +4,7 @@
 
 #include <onmt/Tokenizer.h>
 #include <onmt/BPE.h>
-#ifdef WITH_SP
-#  include <onmt/SentencePiece.h>
-#endif
+#include <onmt/SentencePiece.h>
 
 #include "tokenization_args.h"
 
@@ -33,7 +31,6 @@ int main(int argc, char* argv[])
     ("bpe_vocab_threshold", "Deprecated, see --vocabulary_threshold",
      cxxopts::value<int>()->default_value("50"))
 
-#ifdef WITH_SP
     ("sp_model_path", "Path to the SentencePiece model",
      cxxopts::value<std::string>()->default_value(""))
     ("s,sp_model", "Aliases for --sp_model_path",
@@ -42,7 +39,6 @@ int main(int argc, char* argv[])
      cxxopts::value<int>()->default_value("0"))
     ("sp_alpha", "Smoothing parameter for the SentencePiece sampling API",
      cxxopts::value<float>()->default_value("0.1"))
-#endif
 
     ("vocabulary", "If provided, sentences are encoded to subword present in this vocabulary",
      cxxopts::value<std::string>()->default_value(""))
@@ -76,7 +72,6 @@ int main(int argc, char* argv[])
     subword_encoder = new onmt::BPE(bpe_model,
                                     vm["joiner"].as<std::string>(),
                                     vm["bpe_dropout"].as<float>());
-#ifdef WITH_SP
   else
   {
     std::string sp_model = (vm.count("sp_model_path")
@@ -87,7 +82,6 @@ int main(int argc, char* argv[])
                                             vm["sp_nbest_size"].as<int>(),
                                             vm["sp_alpha"].as<float>());
   }
-#endif
 
   if (subword_encoder && !vocabulary.empty())
     subword_encoder->load_vocabulary(vocabulary, vocabulary_threshold);
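
With the `WITH_SP` guards removed here, the SentencePiece options are always registered and the encoder is always constructible. A minimal sketch of that code path, assuming the encode() interface shown in src/SentencePiece.cc below; "sp.model" is a placeholder path, and the numeric arguments are the CLI defaults for --sp_nbest_size (0) and --sp_alpha (0.1):

#include <iostream>
#include <string>

#include <onmt/SentencePiece.h>

int main()
{
  // nbest_size 0 selects deterministic encoding; alpha only matters when sampling.
  onmt::SentencePiece encoder("sp.model", /*sp_nbest_size=*/0, /*sp_alpha=*/0.1f);

  // encode() returns the subword pieces of one sentence.
  for (const std::string& piece : encoder.encode("Hello world!"))
    std::cout << piece << std::endl;
  return 0;
}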

src/SentencePiece.cc (-14)

@@ -43,24 +43,14 @@ namespace onmt
 
   void SentencePiece::set_vocabulary(const std::vector<std::string>& vocabulary)
   {
-#ifdef SP_HAS_VOCAB_RESTRICTION
     auto status = _processor->SetVocabulary(vocabulary);
     if (!status.ok())
       throw std::invalid_argument(status.ToString());
-#else
-    throw std::runtime_error("The project was built against a SentencePiece version "
-                             "that does not support vocabulary restriction");
-#endif
   }
 
   void SentencePiece::reset_vocabulary()
   {
-#ifdef SP_HAS_VOCAB_RESTRICTION
     _processor->ResetVocabulary();
-#else
-    throw std::runtime_error("The project was built against a SentencePiece version "
-                             "that does not support vocabulary restriction");
-#endif
   }
 
   void SentencePiece::enable_regularization(int nbest_size, float alpha)
@@ -73,14 +63,10 @@ namespace onmt
   {
     std::vector<std::string> pieces;
 
-#ifdef SP_HAS_SAMPLE_ENCODE
     if (_nbest_size != 0)
       _processor->SampleEncode(str, _nbest_size, _alpha, &pieces);
     else
-#endif
-    {
      _processor->Encode(str, &pieces);
-    }
 
     return pieces;
   }
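
Since the pinned submodule revision is known to provide SetVocabulary and SampleEncode, both vocabulary restriction and subword sampling are now unconditional. A sketch of the two entry points, assuming the single-argument constructor that Tokenizer::set_sp_model relies on; the model path and vocabulary pieces are placeholders:

#include <string>
#include <vector>

#include <onmt/SentencePiece.h>

int main()
{
  onmt::SentencePiece encoder("sp.model");  // placeholder model path

  // Restrict encoding to a given vocabulary, then lift the restriction.
  std::vector<std::string> vocabulary = {"▁Hello", "▁world"};  // illustrative pieces
  encoder.set_vocabulary(vocabulary);
  encoder.reset_vocabulary();

  // Switch encode() from Encode to SampleEncode: with a non-zero nbest size,
  // repeated calls may return different segmentations of the same input.
  encoder.enable_regularization(/*nbest_size=*/10, /*alpha=*/0.1f);
  return 0;
}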

src/Tokenizer.cc (+1, -13)

@@ -7,9 +7,7 @@
 #include <sstream>
 
 #include "onmt/BPE.h"
-#ifdef WITH_SP
-#  include "onmt/SentencePiece.h"
-#endif
+#include "onmt/SentencePiece.h"
 #include "onmt/unicode/Unicode.h"
 #include "Casing.h"
 #include "Utils.h"
@@ -134,14 +132,12 @@ namespace onmt
     , _joiner(joiner)
   {
     read_flags(flags);
-#ifdef WITH_SP
     if (dynamic_cast<const SentencePiece*>(subword_encoder) != nullptr
         && _mode == Mode::None && !_joiner_annotate && !_spacer_annotate)
     {
       _spacer_annotate = true;
      _no_substitution = true;
    }
-#endif
  }
 
  Tokenizer::Tokenizer(const std::string& sp_model_path,
@@ -157,11 +153,7 @@ namespace onmt
    read_flags(flags);
    set_sp_model(sp_model_path, _cache_model);
    if (sp_nbest_size != 0)
-#ifdef SP_HAS_SAMPLE_ENCODE
      ((SentencePiece*)_subword_encoder)->enable_regularization(sp_nbest_size, sp_alpha);
-#else
-      throw std::runtime_error("This version of SentencePiece does not include the sampling API");
-#endif
  }
 
  void Tokenizer::read_flags(int flags)
@@ -977,16 +969,12 @@ namespace onmt
 
  Tokenizer& Tokenizer::set_sp_model(const std::string& model_path, bool cache_model)
  {
-#ifdef WITH_SP
    if (_mode == Mode::None && !_joiner_annotate && !_spacer_annotate)
    {
      _spacer_annotate = true;
      _no_substitution = true;
    }
    return this->set_subword_encoder_model<SentencePiece>(model_path, cache_model);
-#else
-    throw std::runtime_error("The Tokenizer was not built with SentencePiece support");
-#endif
  }
 
  bool Tokenizer::add_alphabet_to_segment(const std::string& alphabet)
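
At the Tokenizer level the change removes two failure modes: attaching a SentencePiece model now always enables spacer annotation without substitution when no mode or annotation flag is set, and the sampling constructor no longer throws. A sketch using the constructor exercised by the regularization test below; the model path is a placeholder, and the tokenize(text, words) call is the library's usual entry point:

#include <iostream>
#include <string>
#include <vector>

#include <onmt/Tokenizer.h>

int main()
{
  // nbest_size 1 and alpha 0.1 match the SentencePieceSubwordRegularization test.
  onmt::Tokenizer tokenizer("sp.model", /*sp_nbest_size=*/1, /*sp_alpha=*/0.1);

  std::vector<std::string> tokens;
  tokenizer.tokenize("Hello world!", tokens);  // sampled subword tokens
  for (const std::string& token : tokens)
    std::cout << token << std::endl;
  return 0;
}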

test/test.cc (+1, -16)

@@ -1,6 +1,7 @@
 #include <gtest/gtest.h>
 
 #include <onmt/BPE.h>
+#include <onmt/SentencePiece.h>
 #include <onmt/Tokenizer.h>
 #include <onmt/SpaceTokenizer.h>
 
@@ -640,10 +641,6 @@ TEST(TokenizerTest, CharModeSpacerNew) {
   test_tok(tokenizer, " Hello World 123.", "H e l l o ▁ W o r l d ▁ 1 2 3 .");
 }
 
-#ifdef WITH_SP
-
-#  include <onmt/SentencePiece.h>
-
 TEST(TokenizerTest, SentencePiece) {
   Tokenizer tokenizer(Tokenizer::Mode::None, Tokenizer::Flags::SentencePieceModel,
                       get_data("sp-models/sp.model"));
@@ -682,14 +679,12 @@ TEST(TokenizerTest, SentencePieceWithJoinersAndPh_preserve) {
            "The two shows ■, called■ ⦅Desire⦆ ■ and S ■e ■c ■re ■t ■s ■, will be one ■- ■hour prime ■- ■time shows ■.");
 }
 
-#ifdef SP_HAS_SAMPLE_ENCODE
 TEST(TokenizerTest, SentencePieceSubwordRegularization) {
   Tokenizer tokenizer(get_data("sp-models/sp_regularization.model"), 1, 0.1);
   test_tok_and_detok(tokenizer,
                      "The two shows, called Desire and Secrets, will be one-hour prime-time shows.",
                      "▁The ▁ two ▁show s , ▁call ed ▁De si re ▁ and ▁Sec re t s , ▁w ill ▁be ▁one - h our ▁ pri me - t im e ▁show s .");
 }
-#endif
 
 TEST(TokenizerTest, SentencePieceAlt) {
   Tokenizer tokenizer(Tokenizer::Mode::None, Tokenizer::Flags::SentencePieceModel,
@@ -762,16 +757,6 @@ TEST(TokenizerTest, AggressiveWithSentencePieceIsolatedSpacerAndJoinerAnnotate)
   test_tok(tokenizer, "depending on its temperature.", "depending on its temperature ■.");
 }
 
-#else
-
-TEST(TokenizerTest, NoSentencePieceSupport) {
-  ASSERT_THROW(Tokenizer(Tokenizer::Mode::None, Tokenizer::Flags::SentencePieceModel,
-               get_data("sp-models/sp.model")),
-               std::runtime_error);
-}
-
-#endif
-
 TEST(TokenizerTest, WithoutVocabulary) {
   Tokenizer tokenizer(Tokenizer::Mode::Space,
                       Tokenizer::Flags::JoinerAnnotate,
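
The NoSentencePieceSupport test, which asserted that constructing a tokenizer with a SentencePiece model throws when support is compiled out, is deleted rather than updated: with the submodule always present, that configuration can no longer exist.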

third_party/sentencepiece

Submodule sentencepiece added at cf98a47
