OpenNMT
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎.travis.yml
+27 b/‎.travis.yml
+27
diff --git a/‎CHANGELOG.md
+3 b/‎CHANGELOG.md
+3
diff --git a/‎CMakeLists.txt
+32 b/‎CMakeLists.txt
+32
diff --git a/‎LICENSE.md
+19 b/‎LICENSE.md
+19
diff --git a/‎README.md
+42 b/‎README.md
+42
diff --git a/‎cli/CMakeLists.txt
+21 b/‎cli/CMakeLists.txt
+21
diff --git a/‎cli/detokenize.cc
+43 b/‎cli/detokenize.cc
+43
diff --git a/‎cli/tokenize.cc
+52 b/‎cli/tokenize.cc
+52
diff --git a/‎include/onmt/BPE.h
+36 b/‎include/onmt/BPE.h
+36
diff --git a/‎include/onmt/CaseModifier.h
+33 b/‎include/onmt/CaseModifier.h
+33
diff --git a/‎include/onmt/ITokenizer.h
+29 b/‎include/onmt/ITokenizer.h
+29
diff --git a/‎include/onmt/SpaceTokenizer.h
+24 b/‎include/onmt/SpaceTokenizer.h
+24
@@ -0,0 +1 @@
+build
@@ -0,0 +1,27 @@
+# Travis CI configuration: builds the project with gcc and clang, then runs
+# the `check` target.
+sudo: false
+language: cpp
+compiler:
+  - gcc
+  - clang
+addons:
+  apt:
+    # Extra APT sources -- presumably needed for the newer gcc and cmake
+    # packages requested below; TODO confirm.
+    sources:
+      - george-edison55-precise-backports
+      - ubuntu-toolchain-r-test
+    packages:
+      - gcc-4.8
+      - g++-4.8
+      - cmake
+      - cmake-data
+      - libboost-dev
+      - libboost-program-options-dev
+before_install:
+  # For the gcc job, switch to the gcc-4.8 / g++-4.8 packages installed above.
+  - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
+  # NOTE(review): ROOT_TRAVIS_DIR is set to the current directory, so the
+  # following `cd` does not change directory.
+  - export ROOT_TRAVIS_DIR=$(pwd)
+  - cd $ROOT_TRAVIS_DIR
+install:
+  # Out-of-source Release build in ./build.
+  - mkdir build && cd build
+  - cmake -DCMAKE_BUILD_TYPE=Release ..
+  - make
+script:
+  # Runs from ./build (the directory entered during `install`).
+  - make check
@@ -0,0 +1,3 @@
+## [v0.1.0](https://github.com/OpenNMT/Tokenizer/releases/tag/v0.1.0) (2017-02-14)
+
+Initial release.
@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 3.1.0)
+project(tokenizer)
+
+# Set -DLIB_ONLY=ON to build the shared library without the clients and tests.
+option(LIB_ONLY "Build only the library (skip cli and test)" OFF)
+
+# The sources require C++11. Without CMAKE_CXX_STANDARD_REQUIRED the requested
+# standard is only a preference and CMake may silently fall back to an older
+# one; make it a hard configure-time requirement instead.
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+if(MSVC)
+  # /W4 rather than /Wall: /Wall also enables every off-by-default warning and
+  # mostly reports diagnostics coming from system headers.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
+endif()
+
+# Public headers of the library. The variable is kept under its original name
+# because subdirectories may reference it.
+set(INCLUDE_DIRECTORIES
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  )
+include_directories(${INCLUDE_DIRECTORIES})
+
+add_library(${PROJECT_NAME} SHARED
+  src/BPE.cc
+  src/CaseModifier.cc
+  src/ITokenizer.cc
+  src/SpaceTokenizer.cc
+  src/Tokenizer.cc
+  src/unicode/Data.cc
+  src/unicode/Unicode.cc
+  )
+
+# PUBLIC: targets linking against the library also get its include path.
+target_include_directories(${PROJECT_NAME} PUBLIC ${INCLUDE_DIRECTORIES})
+
+if (NOT LIB_ONLY)
+  add_subdirectory(cli)
+  add_subdirectory(test)
+endif()
@@ -0,0 +1,19 @@
+The MIT License (MIT)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
@@ -0,0 +1,42 @@
+[![Build Status](https://api.travis-ci.org/OpenNMT/Tokenizer.svg?branch=master)](https://travis-ci.org/OpenNMT/Tokenizer)
+
+# Tokenizer
+
+Tokenizer is a C++ implementation of OpenNMT tokenization and detokenization.
+
+## Dependencies
+
+Compiling executables requires:
+
+* `Boost` (`program_options`)
+
+## Compiling
+
+*CMake and a compiler that supports the C++11 standard are required to compile the project.*
+
+```
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE=<Release or Debug> ..
+make
+```
+
+It will produce the dynamic library `libtokenizer.so` (or `.dylib` on Mac OS, `.dll` on Windows), and the tokenization tools `cli/tokenize` and `cli/detokenize`.
+
+### Options
+
+* To compile only the library, use the `-DLIB_ONLY=ON` flag.
+
+## Using
+
+### Clients
+
+See `--help` on the clients to discover available options and usage. They have the same interface as their Lua counterparts.
+
+### Library
+
+This project is also a convenient way to apply OpenNMT tokenization in existing software.
+
+See:
+
+* `include/onmt/Tokenizer.h` to apply OpenNMT's tokenization and detokenization
@@ -0,0 +1,21 @@
+# The clients cannot be built without Boost.ProgramOptions, so fail at
+# configure time with a clear message instead of at compile/link time.
+find_package(Boost REQUIRED COMPONENTS program_options)
+
+include_directories(
+  ${Boost_INCLUDE_DIRS}
+  )
+
+# Tokenization client.
+add_executable(tokenize
+  tokenize.cc
+  )
+target_link_libraries(tokenize
+  ${PROJECT_NAME}
+  ${Boost_LIBRARIES}
+  )
+
+# Detokenization client.
+add_executable(detokenize
+  detokenize.cc
+  )
+target_link_libraries(detokenize
+  ${PROJECT_NAME}
+  ${Boost_LIBRARIES}
+  )
@@ -0,0 +1,43 @@
+#include <iostream>
+
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include <onmt/Tokenizer.h>
+
+namespace po = boost::program_options;
+
+// Entry point of the detokenization client.
+//
+// Reads lines from standard input, detokenizes each non-empty line and writes
+// the result to standard output, one output line per input line (empty lines
+// are echoed as empty lines).
+//
+// Returns 0 on success and 1 when the help was requested or when the command
+// line could not be parsed.
+int main(int argc, char* argv[])
+{
+  po::options_description desc("Detokenization");
+  desc.add_options()
+    ("help", "display available options")
+    ("joiner", po::value<std::string>()->default_value(onmt::Tokenizer::joiner_marker), "character used to annotate joiners")
+    ("case_feature", po::bool_switch()->default_value(false), "first feature is the case")
+    ;
+
+  po::variables_map vm;
+  try
+  {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+  }
+  catch (const po::error& e)
+  {
+    // Unknown or malformed options raise an exception; report it instead of
+    // letting it escape main() and abort the process.
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+
+  if (vm.count("help"))
+  {
+    std::cerr << desc << std::endl;
+    return 1;
+  }
+
+  // Automatic storage: the tokenizer is destroyed when main() returns, with no
+  // raw owning pointer to leak.
+  onmt::Tokenizer tokenizer_impl(vm["case_feature"].as<bool>(),
+                                 vm["joiner"].as<std::string>());
+  // Calls go through the ITokenizer interface, which is the class declaring
+  // the single-string detokenize() overload.
+  onmt::ITokenizer& tokenizer = tokenizer_impl;
+
+  std::string line;
+
+  while (std::getline(std::cin, line))
+  {
+    if (!line.empty())
+      std::cout << tokenizer.detokenize(line);
+
+    // std::endl flushes after every line, as the original did.
+    std::cout << std::endl;
+  }
+
+  return 0;
+}
@@ -0,0 +1,52 @@
+#include <iostream>
+
+#include <boost/program_options.hpp>
+
+#include <onmt/Tokenizer.h>
+
+namespace po = boost::program_options;
+
+// Entry point of the tokenization client.
+//
+// Reads lines from standard input, tokenizes each non-empty line and writes
+// the result to standard output, one output line per input line (empty lines
+// are echoed as empty lines).
+//
+// Any --mode value other than "aggressive" selects the conservative mode.
+//
+// Returns 0 on success and 1 when the help was requested or when the command
+// line could not be parsed.
+int main(int argc, char* argv[])
+{
+  po::options_description desc("Tokenization");
+  desc.add_options()
+    ("help", "display available options")
+    ("mode", po::value<std::string>()->default_value("conservative"), "Define how aggressive should the tokenization be: 'aggressive' only keeps sequences of letters/numbers, 'conservative' allows mix of alphanumeric as in: '2,000', 'E65', 'soft-landing'")
+    ("joiner_annotate", po::bool_switch()->default_value(false), "include joiner annotation using 'joiner' character")
+    ("joiner", po::value<std::string>()->default_value(onmt::Tokenizer::joiner_marker), "character used to annotate joiners")
+    ("joiner_new", po::bool_switch()->default_value(false), "in joiner_annotate mode, 'joiner' is an independent token")
+    ("case_feature", po::bool_switch()->default_value(false), "lowercase corpus and generate case feature")
+    ("bpe_model", po::value<std::string>()->default_value(""), "path to the BPE model")
+    ;
+
+  po::variables_map vm;
+  try
+  {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+  }
+  catch (const po::error& e)
+  {
+    // Unknown or malformed options raise an exception; report it instead of
+    // letting it escape main() and abort the process.
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+
+  if (vm.count("help"))
+  {
+    std::cerr << desc << std::endl;
+    return 1;
+  }
+
+  // Automatic storage: the tokenizer is destroyed when main() returns, with no
+  // raw owning pointer to leak.
+  onmt::Tokenizer tokenizer_impl(vm["mode"].as<std::string>() == "aggressive"
+                                 ? onmt::Tokenizer::Mode::Aggressive
+                                 : onmt::Tokenizer::Mode::Conservative,
+                                 vm["bpe_model"].as<std::string>(),
+                                 vm["case_feature"].as<bool>(),
+                                 vm["joiner_annotate"].as<bool>(),
+                                 vm["joiner_new"].as<bool>(),
+                                 vm["joiner"].as<std::string>());
+  // Calls go through the ITokenizer interface, which is the class declaring
+  // the single-string tokenize() overload.
+  onmt::ITokenizer& tokenizer = tokenizer_impl;
+
+  std::string line;
+
+  while (std::getline(std::cin, line))
+  {
+    if (!line.empty())
+      std::cout << tokenizer.tokenize(line);
+
+    // std::endl flushes after every line, as the original did.
+    std::cout << std::endl;
+  }
+
+  return 0;
+}
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace onmt
+{
+
+  // Byte Pair Encoding helper.
+  //
+  // NOTE(review): `model` is presumably the path to a BPE model file (the
+  // tokenize client describes its bpe_model option as "path to the BPE
+  // model"); confirm against src/BPE.cc.
+  class BPE
+  {
+  public:
+    BPE(const std::string& model);
+
+    // Returns the pieces produced by encoding `str`.
+    std::vector<std::string> encode(const std::string& str) const;
+
+  private:
+    static const std::string end_of_word;
+
+    // Hash functor for std::pair keys.
+    //
+    // The two member hashes are mixed with an order-sensitive combine. A plain
+    // XOR would hash every (a, a) pair to 0 and make (a, b) collide with
+    // (b, a), which degrades the lookups in _codes.
+    struct pair_hash {
+    public:
+      template <typename T, typename U>
+      std::size_t operator()(const std::pair<T, U> &x) const
+      {
+        const std::size_t first_hash = std::hash<T>()(x.first);
+        const std::size_t second_hash = std::hash<U>()(x.second);
+        return first_hash
+          ^ (second_hash + 0x9e3779b9 + (first_hash << 6) + (first_hash >> 2));
+      }
+    };
+
+    // Maps a pair of strings to an integer.
+    // NOTE(review): presumably the rank of the corresponding merge operation;
+    // confirm against src/BPE.cc.
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> _codes;
+
+    // Selects one pair among `pairs`.
+    // NOTE(review): presumably the pair with the lowest value in _codes;
+    // confirm against src/BPE.cc.
+    std::pair<std::string, std::string>
+    get_min_pair(const std::vector<std::pair<std::string, std::string> >& pairs) const;
+
+  };
+
+}
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <string>
+
+#include "onmt/unicode/Unicode.h"
+
+namespace onmt
+{
+
+  // Static helpers converting between a token and its case feature, where the
+  // feature is represented as a single character.
+  class CaseModifier
+  {
+  public:
+    // Returns a (string, feature character) pair computed from `token`.
+    // NOTE(review): the string is presumably the lowercased token (the
+    // tokenize client describes case_feature as "lowercase corpus and
+    // generate case feature"); confirm against src/CaseModifier.cc.
+    static std::pair<std::string, char> extract_case(const std::string& token);
+
+    // Returns `token` with the case described by `feat` applied.
+    static std::string apply_case(const std::string& token, char feat);
+
+  private:
+    // Case categories handled internally.
+    enum class Type
+    {
+      Lowercase, Uppercase, Mixed, Capitalized, CapitalizedFirst, None
+    };
+
+    // Computes the next category from the `current` one and a letter type.
+    static Type update_type(Type current, unicode::_type_letter type);
+
+    // Conversions between a category and its character representation.
+    static char type_to_char(Type type);
+    static Type char_to_type(char feature);
+  };
+
+}
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <vector>
+#include <string>
+
+namespace onmt
+{
+
+  // Interface shared by the tokenizers: implementations provide the
+  // vector-based tokenize/detokenize pair, while the string-based overloads
+  // are declared here as overridable conveniences.
+  class ITokenizer
+  {
+  public:
+    // Marker related to token features; its value is defined outside of this
+    // header.
+    static const std::string feature_marker;
+
+    virtual ~ITokenizer() = default;
+
+    // Tokenizes `text`.
+    // NOTE(review): `words` and `features` are non-const references,
+    // presumably filled with the resulting tokens and their features.
+    virtual void tokenize(const std::string& text,
+                          std::vector<std::string>& words,
+                          std::vector<std::vector<std::string>>& features) = 0;
+
+    // Builds the detokenized text from `words` and their `features`.
+    virtual std::string detokenize(const std::vector<std::string>& words,
+                                   const std::vector<std::vector<std::string>>& features) = 0;
+
+    // Tokenizes `text` and returns the tokens separated by spaces.
+    virtual std::string tokenize(const std::string& text);
+
+    // Splits `text` on spaces and detokenizes the resulting tokens.
+    virtual std::string detokenize(const std::string& text);
+  };
+
+}
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "onmt/ITokenizer.h"
+
+namespace onmt
+{
+
+  // This Tokenizer simply splits on spaces. Useful when the text was tokenized
+  // with an external tool.
+  // Tokenizer that only splits on spaces; meant for text that was already
+  // tokenized by an external tool.
+  class SpaceTokenizer : public ITokenizer
+  {
+  public:
+    // Gives access to a tokenizer through the ITokenizer interface.
+    // NOTE(review): presumably a single shared instance; confirm against
+    // src/SpaceTokenizer.cc.
+    static ITokenizer& get_instance();
+
+    // ITokenizer implementation.
+    void tokenize(const std::string& text,
+                  std::vector<std::string>& words,
+                  std::vector<std::vector<std::string>>& features) override;
+    std::string detokenize(const std::vector<std::string>& words,
+                           const std::vector<std::vector<std::string>>& features) override;
+  };
+
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+## [v0.1.0](https://github.com/OpenNMT/Tokenizer/releases/tag/v0.1.0) (2017-02-14)`
	`2`	`+`
	`3`	`+Initial release.`