
Commit 79dc8db

Add SentencePiece as a Git submodule (#167)
1 parent: 757f04f

File tree

11 files changed: +24, -130 lines

.gitmodules (+3)

@@ -4,3 +4,6 @@
 [submodule "third_party/cxxopts"]
 	path = third_party/cxxopts
 	url = https://github.com/jarro2783/cxxopts.git
+[submodule "third_party/sentencepiece"]
+	path = third_party/sentencepiece
+	url = https://github.com/google/sentencepiece.git
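
Because SentencePiece is now tracked as a submodule, a plain `git clone` no longer fetches its sources: building from source requires `git clone --recursive`, or `git submodule update --init` in an existing checkout, before configuring with CMake.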

.travis.yml (+2, -23)

@@ -3,33 +3,12 @@ language: cpp
 compiler:
   - gcc
   - clang
-env:
-  global:
-    - SENTENCEPIECE_VERSION="0.1.8"
-cache:
-  directories:
-    - $HOME/sentencepiece-$SENTENCEPIECE_VERSION/
-before_install:
-  - export ROOT_TRAVIS_DIR=$(pwd)
-  - |
-    if [ ! -d $HOME/sentencepiece-$SENTENCEPIECE_VERSION/lib ]; then
-      wget https://github.com/google/sentencepiece/archive/v$SENTENCEPIECE_VERSION.tar.gz
-      tar xf v$SENTENCEPIECE_VERSION.tar.gz
-      cd sentencepiece-$SENTENCEPIECE_VERSION
-      mkdir build
-      cd build
-      cmake -DCMAKE_INSTALL_PREFIX=$HOME/sentencepiece-$SENTENCEPIECE_VERSION ..
-      make
-      make install
-      cd $ROOT_TRAVIS_DIR
-    fi
 install:
   - export TOKENIZER_ROOT=$HOME/Tokenizer
-  - export SENTENCEPIECE_ROOT=$HOME/sentencepiece-$SENTENCEPIECE_VERSION
   - mkdir build && cd build
-  - cmake -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$TOKENIZER_ROOT -DCMAKE_PREFIX_PATH=$SENTENCEPIECE_ROOT ..
+  - cmake -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$TOKENIZER_ROOT ..
   - make install
-  - cd $ROOT_TRAVIS_DIR
+  - cd ..
 script:
   - build/test/onmt_tokenizer_test test/data
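
The CI recipe shrinks accordingly: SentencePiece is now compiled as part of the main CMake build, so the cached external install and the `-DCMAKE_PREFIX_PATH` hint are no longer needed.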

CMakeLists.txt (+14, -38)

@@ -17,6 +17,10 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif(NOT CMAKE_BUILD_TYPE)
 
+option(SPM_ENABLE_SHARED "" OFF)
+option(SPM_ENABLE_TCMALLOC "" OFF)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third_party/sentencepiece EXCLUDE_FROM_ALL)
+
 if(MSVC)
   if(NOT BUILD_SHARED_LIBS)
     if(CMAKE_VERSION VERSION_LESS "3.15.0")
@@ -32,16 +36,22 @@ endif()
 find_package(ICU REQUIRED)
 
 set(INCLUDE_DIRECTORIES
+  PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}/include
   ${PROJECT_BINARY_DIR}
+
+  PRIVATE
   ${ICU_INCLUDE_DIRS}
+  ${CMAKE_CURRENT_SOURCE_DIR}/third_party/sentencepiece/src
   )
 
 set(PUBLIC_HEADERS
   include/onmt/Token.h
   include/onmt/BPE.h
   include/onmt/BPELearner.h
   include/onmt/ITokenizer.h
+  include/onmt/SPMLearner.h
+  include/onmt/SentencePiece.h
   include/onmt/SpaceTokenizer.h
   include/onmt/SubwordEncoder.h
   include/onmt/SubwordLearner.h
@@ -53,6 +63,8 @@ set(SOURCES
   src/BPELearner.cc
   src/Casing.cc
   src/ITokenizer.cc
+  src/SPMLearner.cc
+  src/SentencePiece.cc
   src/SpaceTokenizer.cc
   src/SubwordEncoder.cc
   src/SubwordLearner.cc
@@ -64,46 +76,10 @@ set(SOURCES
 
 list(APPEND LINK_LIBRARIES
   ${ICU_LIBRARIES}
+  sentencepiece-static
+  sentencepiece_train-static
   )
 
-find_library(SP_LIBRARY NAMES sentencepiece)
-find_path(SP_INCLUDE_DIR NAMES sentencepiece_processor.h)
-
-if(NOT SP_LIBRARY OR NOT SP_INCLUDE_DIR)
-  message(WARNING "sentencepiece not found; will not be supported")
-else()
-  message(STATUS "Found sentencepiece: ${SP_LIBRARY}")
-  add_definitions(-DWITH_SP)
-  list(APPEND PUBLIC_HEADERS include/onmt/SentencePiece.h)
-  list(APPEND SOURCES src/SentencePiece.cc)
-  list(APPEND INCLUDE_DIRECTORIES ${SP_INCLUDE_DIR})
-  list(APPEND LINK_LIBRARIES ${SP_LIBRARY})
-
-  find_library(SP_TRAIN_LIBRARY NAMES sentencepiece_train)
-  if(SP_TRAIN_LIBRARY)
-    find_package(Threads)
-    add_definitions(-DWITH_SP_TRAIN)
-    message(STATUS "Found sentencepiece_train: ${SP_TRAIN_LIBRARY}")
-    list(APPEND PUBLIC_HEADERS include/onmt/SPMLearner.h)
-    list(APPEND SOURCES src/SPMLearner.cc)
-    list(APPEND LINK_LIBRARIES
-      ${SP_TRAIN_LIBRARY}
-      ${CMAKE_THREAD_LIBS_INIT})
-  else()
-    message(WARNING "sentencepiece_train not found: training SentencePiece models will not be supported")
-  endif()
-
-  file(STRINGS ${SP_INCLUDE_DIR}/sentencepiece_processor.h HAS_SAMPLE_ENCODE REGEX "SampleEncode")
-  if(HAS_SAMPLE_ENCODE)
-    add_definitions(-DSP_HAS_SAMPLE_ENCODE)
-  endif()
-
-  file(STRINGS ${SP_INCLUDE_DIR}/sentencepiece_processor.h HAS_VOCAB_RESTRICTION REGEX "SetVocabulary")
-  if(HAS_VOCAB_RESTRICTION)
-    add_definitions(-DSP_HAS_VOCAB_RESTRICTION)
-  endif()
-endif()
-
 add_library(${PROJECT_NAME} ${SOURCES})
 include(GNUInstallDirs)
 include(GenerateExportHeader)
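
Two details of this hunk are worth noting. Setting `SPM_ENABLE_SHARED` and `SPM_ENABLE_TCMALLOC` to OFF before `add_subdirectory` builds SentencePiece as static libraries without the tcmalloc dependency, and `EXCLUDE_FROM_ALL` keeps the submodule's own targets out of the default build; only `sentencepiece-static` and `sentencepiece_train-static` are compiled, because the tokenizer links against them. And since the submodule pins a known SentencePiece revision (cf98a47), the whole deleted `find_library`/`file(STRINGS ...)` block, which probed an external install for `SampleEncode` and `SetVocabulary` and set the `WITH_SP`/`WITH_SP_TRAIN` definitions, is no longer needed.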

README.md (-1)

@@ -70,7 +70,6 @@ See the `-h` flag to list the available options.
 ### Dependencies
 
 * [ICU](http://site.icu-project.org/)
-* (optional) [SentencePiece](https://github.com/google/sentencepiece)
 
 ### Compiling
 
bindings/python/tools/build_wheel.sh (-11)

@@ -6,7 +6,6 @@ set -e
 set -x
 
 ROOT_DIR=$PWD
-SENTENCEPIECE_VERSION=${SENTENCEPIECE_VERSION:-0.1.8}
 PYBIND11_VERSION=${PYBIND11_VERSION:-2.4.3}
 ICU_VERSION=${ICU_VERSION:-64.2}
 PATH=/opt/python/cp37-cp37m/bin:$PATH
@@ -22,16 +21,6 @@ cd $ROOT_DIR
 # Install cmake.
 pip install "cmake==3.13.*"
 
-# Build SentencePiece.
-curl -L -o sentencepiece-${SENTENCEPIECE_VERSION}.tar.gz -O https://github.com/google/sentencepiece/archive/v${SENTENCEPIECE_VERSION}.tar.gz
-tar zxf sentencepiece-${SENTENCEPIECE_VERSION}.tar.gz
-cd sentencepiece-${SENTENCEPIECE_VERSION}
-mkdir build
-cd build
-cmake ..
-make -j2 install
-cd $ROOT_DIR
-
 # Build Tokenizer.
 mkdir build
 cd build
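
The manylinux wheel script drops the same external build step for the same reason: the submodule sources are compiled into the extension through the main CMake project.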

cli/learn.cc (+1, -7)

@@ -6,9 +6,7 @@
 
 #include <onmt/Tokenizer.h>
 #include <onmt/BPELearner.h>
-#ifdef WITH_SP_TRAIN
-#  include <onmt/SPMLearner.h>
-#endif
+#include <onmt/SPMLearner.h>
 
 #include "tokenization_args.h"
 
@@ -95,15 +93,11 @@ int main(int argc, char* argv[])
 
   }
   else if (subword == "sentencepiece") {
-#ifdef WITH_SP_TRAIN
    learner = new onmt::SPMLearner(vm["verbose"].as<bool>(),
                                   std::vector<std::string>(subword_args.begin() + 1,
                                                            subword_args.end()),
                                   vm["tmpfile"].as<std::string>());
-#else
-    std::cerr << "ERROR: this Tokenizer was not built with SentencePiece training support" << std::endl;
    return 1;
-#endif
  }
  else {
    std::cerr << "ERROR: invalid subword type: " << subword << " (accepted: bpe, sentencepiece)" << std::endl;

cli/tokenize.cc (+1, -7)

@@ -4,9 +4,7 @@
 
 #include <onmt/Tokenizer.h>
 #include <onmt/BPE.h>
-#ifdef WITH_SP
-#  include <onmt/SentencePiece.h>
-#endif
+#include <onmt/SentencePiece.h>
 
 #include "tokenization_args.h"
 
@@ -33,7 +31,6 @@ int main(int argc, char* argv[])
     ("bpe_vocab_threshold", "Deprecated, see --vocabulary_threshold",
      cxxopts::value<int>()->default_value("50"))
 
-#ifdef WITH_SP
     ("sp_model_path", "Path to the SentencePiece model",
      cxxopts::value<std::string>()->default_value(""))
     ("s,sp_model", "Aliases for --sp_model_path",
@@ -42,7 +39,6 @@ int main(int argc, char* argv[])
      cxxopts::value<int>()->default_value("0"))
     ("sp_alpha", "Smoothing parameter for the SentencePiece sampling API",
      cxxopts::value<float>()->default_value("0.1"))
-#endif
 
     ("vocabulary", "If provided, sentences are encoded to subword present in this vocabulary",
      cxxopts::value<std::string>()->default_value(""))
@@ -76,7 +72,6 @@ int main(int argc, char* argv[])
     subword_encoder = new onmt::BPE(bpe_model,
                                     vm["joiner"].as<std::string>(),
                                     vm["bpe_dropout"].as<float>());
-#ifdef WITH_SP
   else
   {
     std::string sp_model = (vm.count("sp_model_path")
@@ -87,7 +82,6 @@ int main(int argc, char* argv[])
                                             vm["sp_nbest_size"].as<int>(),
                                             vm["sp_alpha"].as<float>());
   }
-#endif
 
   if (subword_encoder && !vocabulary.empty())
     subword_encoder->load_vocabulary(vocabulary, vocabulary_threshold);
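
With the `WITH_SP` guards removed here, the SentencePiece options are always registered and the encoder is always constructible. A minimal sketch of that code path, assuming the encode() interface shown in src/SentencePiece.cc below; "sp.model" is a placeholder path, and the numeric arguments are the CLI defaults for --sp_nbest_size (0) and --sp_alpha (0.1):

#include <iostream>
#include <string>

#include <onmt/SentencePiece.h>

int main()
{
  // nbest_size 0 selects deterministic encoding; alpha only matters when sampling.
  onmt::SentencePiece encoder("sp.model", /*sp_nbest_size=*/0, /*sp_alpha=*/0.1f);

  // encode() returns the subword pieces of one sentence.
  for (const std::string& piece : encoder.encode("Hello world!"))
    std::cout << piece << std::endl;
  return 0;
}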

src/SentencePiece.cc (-14)

@@ -43,24 +43,14 @@ namespace onmt
 
   void SentencePiece::set_vocabulary(const std::vector<std::string>& vocabulary)
   {
-#ifdef SP_HAS_VOCAB_RESTRICTION
     auto status = _processor->SetVocabulary(vocabulary);
     if (!status.ok())
       throw std::invalid_argument(status.ToString());
-#else
-    throw std::runtime_error("The project was built against a SentencePiece version "
-                             "that does not support vocabulary restriction");
-#endif
   }
 
   void SentencePiece::reset_vocabulary()
   {
-#ifdef SP_HAS_VOCAB_RESTRICTION
     _processor->ResetVocabulary();
-#else
-    throw std::runtime_error("The project was built against a SentencePiece version "
-                             "that does not support vocabulary restriction");
-#endif
   }
 
   void SentencePiece::enable_regularization(int nbest_size, float alpha)
@@ -73,14 +63,10 @@ namespace onmt
   {
     std::vector<std::string> pieces;
 
-#ifdef SP_HAS_SAMPLE_ENCODE
     if (_nbest_size != 0)
       _processor->SampleEncode(str, _nbest_size, _alpha, &pieces);
     else
-#endif
-    {
      _processor->Encode(str, &pieces);
-    }
 
     return pieces;
   }
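
Since the pinned submodule revision is known to provide SetVocabulary and SampleEncode, both vocabulary restriction and subword sampling are now unconditional. A sketch of the two entry points, assuming the single-argument constructor that Tokenizer::set_sp_model relies on; the model path and vocabulary pieces are placeholders:

#include <string>
#include <vector>

#include <onmt/SentencePiece.h>

int main()
{
  onmt::SentencePiece encoder("sp.model");  // placeholder model path

  // Restrict encoding to a given vocabulary, then lift the restriction.
  std::vector<std::string> vocabulary = {"▁Hello", "▁world"};  // illustrative pieces
  encoder.set_vocabulary(vocabulary);
  encoder.reset_vocabulary();

  // Switch encode() from Encode to SampleEncode: with a non-zero nbest size,
  // repeated calls may return different segmentations of the same input.
  encoder.enable_regularization(/*nbest_size=*/10, /*alpha=*/0.1f);
  return 0;
}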

src/Tokenizer.cc (+1, -13)

@@ -7,9 +7,7 @@
 #include <sstream>
 
 #include "onmt/BPE.h"
-#ifdef WITH_SP
-#  include "onmt/SentencePiece.h"
-#endif
+#include "onmt/SentencePiece.h"
 #include "onmt/unicode/Unicode.h"
 #include "Casing.h"
 #include "Utils.h"
@@ -134,14 +132,12 @@ namespace onmt
     , _joiner(joiner)
   {
     read_flags(flags);
-#ifdef WITH_SP
     if (dynamic_cast<const SentencePiece*>(subword_encoder) != nullptr
         && _mode == Mode::None && !_joiner_annotate && !_spacer_annotate)
     {
       _spacer_annotate = true;
      _no_substitution = true;
    }
-#endif
  }
 
  Tokenizer::Tokenizer(const std::string& sp_model_path,
@@ -157,11 +153,7 @@ namespace onmt
    read_flags(flags);
    set_sp_model(sp_model_path, _cache_model);
    if (sp_nbest_size != 0)
-#ifdef SP_HAS_SAMPLE_ENCODE
      ((SentencePiece*)_subword_encoder)->enable_regularization(sp_nbest_size, sp_alpha);
-#else
-      throw std::runtime_error("This version of SentencePiece does not include the sampling API");
-#endif
  }
 
  void Tokenizer::read_flags(int flags)
@@ -977,16 +969,12 @@ namespace onmt
 
  Tokenizer& Tokenizer::set_sp_model(const std::string& model_path, bool cache_model)
  {
-#ifdef WITH_SP
    if (_mode == Mode::None && !_joiner_annotate && !_spacer_annotate)
    {
      _spacer_annotate = true;
      _no_substitution = true;
    }
    return this->set_subword_encoder_model<SentencePiece>(model_path, cache_model);
-#else
-    throw std::runtime_error("The Tokenizer was not built with SentencePiece support");
-#endif
  }
 
  bool Tokenizer::add_alphabet_to_segment(const std::string& alphabet)
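
At the Tokenizer level the change removes two failure modes: attaching a SentencePiece model now always enables spacer annotation without substitution when no mode or annotation flag is set, and the sampling constructor no longer throws. A sketch using the constructor exercised by the regularization test below; the model path is a placeholder, and the tokenize(text, words) call is the library's usual entry point:

#include <iostream>
#include <string>
#include <vector>

#include <onmt/Tokenizer.h>

int main()
{
  // nbest_size 1 and alpha 0.1 match the SentencePieceSubwordRegularization test.
  onmt::Tokenizer tokenizer("sp.model", /*sp_nbest_size=*/1, /*sp_alpha=*/0.1);

  std::vector<std::string> tokens;
  tokenizer.tokenize("Hello world!", tokens);  // sampled subword tokens
  for (const std::string& token : tokens)
    std::cout << token << std::endl;
  return 0;
}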

test/test.cc (+1, -16)

@@ -1,6 +1,7 @@
 #include <gtest/gtest.h>
 
 #include <onmt/BPE.h>
+#include <onmt/SentencePiece.h>
 #include <onmt/Tokenizer.h>
 #include <onmt/SpaceTokenizer.h>
 
@@ -640,10 +641,6 @@ TEST(TokenizerTest, CharModeSpacerNew) {
   test_tok(tokenizer, " Hello World 123.", "H e l l o ▁ W o r l d ▁ 1 2 3 .");
 }
 
-#ifdef WITH_SP
-
-#  include <onmt/SentencePiece.h>
-
 TEST(TokenizerTest, SentencePiece) {
   Tokenizer tokenizer(Tokenizer::Mode::None, Tokenizer::Flags::SentencePieceModel,
                       get_data("sp-models/sp.model"));
@@ -682,14 +679,12 @@ TEST(TokenizerTest, SentencePieceWithJoinersAndPh_preserve) {
            "The two shows ■, called■ ⦅Desire⦆ ■ and S ■e ■c ■re ■t ■s ■, will be one ■- ■hour prime ■- ■time shows ■.");
 }
 
-#ifdef SP_HAS_SAMPLE_ENCODE
 TEST(TokenizerTest, SentencePieceSubwordRegularization) {
   Tokenizer tokenizer(get_data("sp-models/sp_regularization.model"), 1, 0.1);
   test_tok_and_detok(tokenizer,
                      "The two shows, called Desire and Secrets, will be one-hour prime-time shows.",
                      "▁The ▁ two ▁show s , ▁call ed ▁De si re ▁ and ▁Sec re t s , ▁w ill ▁be ▁one - h our ▁ pri me - t im e ▁show s .");
 }
-#endif
 
 TEST(TokenizerTest, SentencePieceAlt) {
   Tokenizer tokenizer(Tokenizer::Mode::None, Tokenizer::Flags::SentencePieceModel,
@@ -762,16 +757,6 @@ TEST(TokenizerTest, AggressiveWithSentencePieceIsolatedSpacerAndJoinerAnnotate)
   test_tok(tokenizer, "depending on its temperature.", "depending on its temperature ■.");
 }
 
-#else
-
-TEST(TokenizerTest, NoSentencePieceSupport) {
-  ASSERT_THROW(Tokenizer(Tokenizer::Mode::None, Tokenizer::Flags::SentencePieceModel,
-               get_data("sp-models/sp.model")),
-               std::runtime_error);
-}
-
-#endif
-
 TEST(TokenizerTest, WithoutVocabulary) {
   Tokenizer tokenizer(Tokenizer::Mode::Space,
                       Tokenizer::Flags::JoinerAnnotate,
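
The NoSentencePieceSupport test, which asserted that constructing a tokenizer with a SentencePiece model throws when support is compiled out, is deleted rather than updated: with the submodule always present, that configuration can no longer exist.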

third_party/sentencepiece

Submodule sentencepiece added at cf98a47
