Skip to content

Commit 8d363d9

Browse files
committed
Initial release of standalone tokenizer
0 parents  commit 8d363d9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+2455
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
build

.travis.yml

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
sudo: false
2+
language: cpp
3+
compiler:
4+
- gcc
5+
- clang
6+
addons:
7+
apt:
8+
sources:
9+
- george-edison55-precise-backports
10+
- ubuntu-toolchain-r-test
11+
packages:
12+
- gcc-4.8
13+
- g++-4.8
14+
- cmake
15+
- cmake-data
16+
- libboost-dev
17+
- libboost-program-options-dev
18+
before_install:
19+
- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
20+
- export ROOT_TRAVIS_DIR=$(pwd)
21+
- cd $ROOT_TRAVIS_DIR
22+
install:
23+
- mkdir build && cd build
24+
- cmake -DCMAKE_BUILD_TYPE=Release ..
25+
- make
26+
script:
27+
- make check

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
## [v0.1.0](https://github.com/OpenNMT/Tokenizer/releases/tag/v0.1.0) (2017-02-14)
2+
3+
Initial release.

CMakeLists.txt

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
cmake_minimum_required(VERSION 3.1.0)
2+
project(tokenizer)
3+
4+
set(CMAKE_CXX_STANDARD 11)
5+
6+
if(MSVC)
7+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Wall")
8+
else()
9+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
10+
endif()
11+
12+
set(INCLUDE_DIRECTORIES
13+
${CMAKE_CURRENT_SOURCE_DIR}/include
14+
)
15+
include_directories(${INCLUDE_DIRECTORIES})
16+
17+
add_library(${PROJECT_NAME} SHARED
18+
src/BPE.cc
19+
src/CaseModifier.cc
20+
src/ITokenizer.cc
21+
src/SpaceTokenizer.cc
22+
src/Tokenizer.cc
23+
src/unicode/Data.cc
24+
src/unicode/Unicode.cc
25+
)
26+
27+
target_include_directories(${PROJECT_NAME} PUBLIC ${INCLUDE_DIRECTORIES})
28+
29+
if (NOT LIB_ONLY)
30+
add_subdirectory(cli)
31+
add_subdirectory(test)
32+
endif()

LICENSE.md

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
The MIT License (MIT)
2+
3+
Permission is hereby granted, free of charge, to any person obtaining a copy
4+
of this software and associated documentation files (the "Software"), to deal
5+
in the Software without restriction, including without limitation the rights
6+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7+
copies of the Software, and to permit persons to whom the Software is
8+
furnished to do so, subject to the following conditions:
9+
10+
The above copyright notice and this permission notice shall be included in
11+
all copies or substantial portions of the Software.
12+
13+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19+
THE SOFTWARE.

README.md

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
[![Build Status](https://api.travis-ci.org/OpenNMT/Tokenizer.svg?branch=master)](https://travis-ci.org/OpenNMT/Tokenizer)
2+
3+
# Tokenizer
4+
5+
Tokenizer is a C++ implementation of OpenNMT tokenization and detokenization.
6+
7+
## Dependencies
8+
9+
Compiling executables requires:
10+
11+
* `Boost` (`program_options`)
12+
13+
## Compiling
14+
15+
*CMake and a compiler that supports the C++11 standard are required to compile the project.*
16+
17+
```
18+
mkdir build
19+
cd build
20+
cmake -DCMAKE_BUILD_TYPE=<Release or Debug> ..
21+
make
22+
```
23+
24+
It will produce the dynamic library `libtokenizer.so` (or `.dylib` on Mac OS, `.dll` on Windows), and the tokenization tools `cli/tokenize` and `cli/detokenize`.
25+
26+
### Options
27+
28+
* To compile only the library, use the `-DLIB_ONLY=ON` flag.
29+
30+
## Using
31+
32+
### Clients
33+
34+
See `--help` on the clients to discover available options and usage. They have the same interface as their Lua counterparts.
35+
36+
### Library
37+
38+
This project is also a convenient way to apply OpenNMT tokenization in existing software.
39+
40+
See:
41+
42+
* `include/onmt/Tokenizer.h` to apply OpenNMT's tokenization and detokenization

cli/CMakeLists.txt

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
find_package(Boost COMPONENTS program_options)
2+
3+
include_directories(
4+
${Boost_INCLUDE_DIRS}
5+
)
6+
7+
add_executable(tokenize
8+
tokenize.cc
9+
)
10+
target_link_libraries(tokenize
11+
${PROJECT_NAME}
12+
${Boost_LIBRARIES}
13+
)
14+
15+
add_executable(detokenize
16+
detokenize.cc
17+
)
18+
target_link_libraries(detokenize
19+
${PROJECT_NAME}
20+
${Boost_LIBRARIES}
21+
)

cli/detokenize.cc

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#include <iostream>
2+
3+
#include <boost/program_options.hpp>
4+
#include <boost/algorithm/string.hpp>
5+
6+
#include <onmt/Tokenizer.h>
7+
8+
namespace po = boost::program_options;
9+
10+
int main(int argc, char* argv[])
11+
{
12+
po::options_description desc("Detokenization");
13+
desc.add_options()
14+
("help", "display available options")
15+
("joiner", po::value<std::string>()->default_value(onmt::Tokenizer::joiner_marker), "character used to annotate joiners")
16+
("case_feature", po::bool_switch()->default_value(false), "first feature is the case")
17+
;
18+
19+
po::variables_map vm;
20+
po::store(po::parse_command_line(argc, argv, desc), vm);
21+
po::notify(vm);
22+
23+
if (vm.count("help"))
24+
{
25+
std::cerr << desc << std::endl;
26+
return 1;
27+
}
28+
29+
onmt::ITokenizer* tokenizer = new onmt::Tokenizer(vm["case_feature"].as<bool>(),
30+
vm["joiner"].as<std::string>());
31+
32+
std::string line;
33+
34+
while (std::getline(std::cin, line))
35+
{
36+
if (!line.empty())
37+
std::cout << tokenizer->detokenize(line);
38+
39+
std::cout << std::endl;
40+
}
41+
42+
return 0;
43+
}

cli/tokenize.cc

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include <iostream>
2+
3+
#include <boost/program_options.hpp>
4+
5+
#include <onmt/Tokenizer.h>
6+
7+
namespace po = boost::program_options;
8+
9+
int main(int argc, char* argv[])
10+
{
11+
po::options_description desc("Tokenization");
12+
desc.add_options()
13+
("help", "display available options")
14+
("mode", po::value<std::string>()->default_value("conservative"), "Define how aggressive should the tokenization be: 'aggressive' only keeps sequences of letters/numbers, 'conservative' allows mix of alphanumeric as in: '2,000', 'E65', 'soft-landing'")
15+
("joiner_annotate", po::bool_switch()->default_value(false), "include joiner annotation using 'joiner' character")
16+
("joiner", po::value<std::string>()->default_value(onmt::Tokenizer::joiner_marker), "character used to annotate joiners")
17+
("joiner_new", po::bool_switch()->default_value(false), "in joiner_annotate mode, 'joiner' is an independent token")
18+
("case_feature", po::bool_switch()->default_value(false), "lowercase corpus and generate case feature")
19+
("bpe_model", po::value<std::string>()->default_value(""), "path to the BPE model")
20+
;
21+
22+
po::variables_map vm;
23+
po::store(po::parse_command_line(argc, argv, desc), vm);
24+
po::notify(vm);
25+
26+
if (vm.count("help"))
27+
{
28+
std::cerr << desc << std::endl;
29+
return 1;
30+
}
31+
32+
onmt::ITokenizer* tokenizer = new onmt::Tokenizer(vm["mode"].as<std::string>() == "aggressive"
33+
? onmt::Tokenizer::Mode::Aggressive
34+
: onmt::Tokenizer::Mode::Conservative,
35+
vm["bpe_model"].as<std::string>(),
36+
vm["case_feature"].as<bool>(),
37+
vm["joiner_annotate"].as<bool>(),
38+
vm["joiner_new"].as<bool>(),
39+
vm["joiner"].as<std::string>());
40+
41+
std::string line;
42+
43+
while (std::getline(std::cin, line))
44+
{
45+
if (!line.empty())
46+
std::cout << tokenizer->tokenize(line);
47+
48+
std::cout << std::endl;
49+
}
50+
51+
return 0;
52+
}

include/onmt/BPE.h

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#pragma once
2+
3+
#include <string>
4+
#include <unordered_map>
5+
#include <vector>
6+
7+
namespace onmt
8+
{
9+
10+
// Byte Pair Encoding model.
//
// Constructed from a `model` string (presumably the path to a BPE codes
// file, as suggested by the tokenize client's "bpe_model" option -- confirm
// in BPE.cc) and used to split a string into a sequence of sub-strings.
class BPE
{
public:
  // Builds the encoder from `model`.
  BPE(const std::string& model);

  // Returns the pieces that `str` is encoded into.
  std::vector<std::string> encode(const std::string& str) const;

private:
  static const std::string end_of_word;

  // Hash functor for std::pair keys stored in `_codes`.
  struct pair_hash {
  public:
    template <typename T, typename U>
    std::size_t operator()(const std::pair<T, U> &x) const
    {
      // Combine the two element hashes in an order-dependent way (same mixing
      // step as boost::hash_combine). A plain XOR maps every (a, a) pair to 0
      // and makes (a, b) collide with (b, a), which degrades the hash table
      // for symbol pairs.
      std::size_t seed = std::hash<T>()(x.first);
      seed ^= std::hash<U>()(x.second) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
      return seed;
    }
  };

  // Maps a pair of strings to an integer (presumably the merge priority of
  // that pair -- confirm in BPE.cc).
  std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> _codes;

  // Selects one pair out of `pairs`; the selection criterion is implemented
  // in BPE.cc.
  std::pair<std::string, std::string>
  get_min_pair(const std::vector<std::pair<std::string, std::string> >& pairs) const;

};
35+
36+
}

include/onmt/CaseModifier.h

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#pragma once
2+
3+
#include <string>
4+
5+
#include "onmt/unicode/Unicode.h"
6+
7+
namespace onmt
8+
{
9+
10+
class CaseModifier
11+
{
12+
public:
13+
static std::pair<std::string, char> extract_case(const std::string& token);
14+
static std::string apply_case(const std::string& token, char feat);
15+
16+
private:
17+
enum class Type
18+
{
19+
Lowercase,
20+
Uppercase,
21+
Mixed,
22+
Capitalized,
23+
CapitalizedFirst,
24+
None
25+
};
26+
27+
static Type update_type(Type current, unicode::_type_letter type);
28+
29+
static char type_to_char(Type type);
30+
static Type char_to_type(char feature);
31+
};
32+
33+
}

include/onmt/ITokenizer.h

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#pragma once
2+
3+
#include <vector>
4+
#include <string>
5+
6+
namespace onmt
7+
{
8+
9+
// Abstract interface shared by all tokenizers.
//
// Implementations provide the two vector-based operations; the string-based
// overloads are virtual with a definition supplied outside this header.
class ITokenizer
{
public:
  // Marker string for token features; its value is defined outside this
  // header.
  static const std::string feature_marker;

  virtual ~ITokenizer() = default;

  // Tokenizes `text`. `words` and `features` are non-const references
  // through which the implementation delivers its result.
  virtual void tokenize(
    const std::string& text,
    std::vector<std::string>& words,
    std::vector<std::vector<std::string>>& features) = 0;

  // Rebuilds a text from `words` and their `features`.
  virtual std::string detokenize(
    const std::vector<std::string>& words,
    const std::vector<std::vector<std::string>>& features) = 0;

  // Convenience form: tokenizes `text` and returns the tokens separated by
  // spaces.
  virtual std::string tokenize(const std::string& text);

  // Convenience form: splits `text` on spaces, then detokenizes the result.
  virtual std::string detokenize(const std::string& text);
};
28+
29+
}

include/onmt/SpaceTokenizer.h

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#pragma once
2+
3+
#include "onmt/ITokenizer.h"
4+
5+
namespace onmt
6+
{
7+
8+
// This Tokenizer simply splits on spaces. Useful when the text was tokenized
9+
// with an external tool.
10+
class SpaceTokenizer: public ITokenizer
11+
{
12+
public:
13+
static ITokenizer& get_instance();
14+
15+
void tokenize(const std::string& text,
16+
std::vector<std::string>& words,
17+
std::vector<std::vector<std::string> >& features) override;
18+
19+
std::string detokenize(const std::vector<std::string>& words,
20+
const std::vector<std::vector<std::string> >& features) override;
21+
22+
};
23+
24+
}

0 commit comments

Comments
 (0)