From 4bebfab9f24b8fb887415beac95bc456b6f848bb Mon Sep 17 00:00:00 2001
From: Scott Ivey
Date: Thu, 19 Nov 2015 23:05:08 -0500
Subject: [PATCH] recognize GENERAL_PUNCTUATION unicode points as non-letters

isLetterPoint() now maps UnicodeBlock::GENERAL_PUNCTUATION to "not a
letter", so code points such as U+2014 EM DASH are no longer treated as
letters.

The final fall-through return of isLatin1SupplementLetter() changes from
false to true: code points that get past the 215 (multiplication sign)
and 247 (division sign) exclusion are now reported as letters.

Tests:
  * getUnicodeBlock(): one case each for BASIC_LATIN,
    GENERAL_PUNCTUATION, LATIN_1_SUPPLEMENT and LATIN_EXTENDED_A.
  * isLetterPoint(): ASCII letter vs. '.', U+2014, the ASCII digits
    '0'..'9', and two Latin-1 Supplement letters (U+00DF, U+00D1).
---
Note: leading whitespace of the code lines in this copy of the patch was
restored by hand. If it does not match the tree byte-for-byte, apply with
`git apply --ignore-whitespace`.

 src/libunicode/code_point_support.cpp         |  4 ++-
 .../test_unit/test_UnicodeBlock.cpp           | 26 +++++++++++++++-
 .../test_unit/test_code_point_support.cpp     | 31 ++++++++++++++++---
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/src/libunicode/code_point_support.cpp b/src/libunicode/code_point_support.cpp
index bb1e9a3..48b2ba6 100644
--- a/src/libunicode/code_point_support.cpp
+++ b/src/libunicode/code_point_support.cpp
@@ -95,7 +95,8 @@ bool isLatin1SupplementLetter(uint32_t cp) {
     if (cp == 215 || cp == 247) {
         return false;
     }
-    return false;
+
+    return true;
 }
 
 bool isMathematicalAlphanumericSymbolLetter(uint32_t cp) {
@@ -114,6 +115,7 @@ bool isLetterPoint(uint32_t cp, UnicodeBlock uBlock) {
         case UnicodeBlock::LATIN_EXTENDED_D : return true;
         case UnicodeBlock::LATIN_EXTENDED_E : return true;
         case UnicodeBlock::LATIN_EXTENDED_ADDITIONAL : return true;
+        case UnicodeBlock::GENERAL_PUNCTUATION : return false;
 
         case UnicodeBlock::ARROWS : return false;
         case UnicodeBlock::BLOCK_ELEMENTS : return false;
diff --git a/src/libunicode/test_unit/test_UnicodeBlock.cpp b/src/libunicode/test_unit/test_UnicodeBlock.cpp
index 6fe359c..6f81afb 100644
--- a/src/libunicode/test_unit/test_UnicodeBlock.cpp
+++ b/src/libunicode/test_unit/test_UnicodeBlock.cpp
@@ -7,10 +7,34 @@ using namespace relevanced;
 using namespace relevanced::libunicode;
 using namespace relevanced::util;
 
-TEST(TestGetUnicodeBlock, Simple) {
+TEST(TestGetUnicodeBlock, BasicLatin) {
     uint32_t codepoint = 65; // 'A'
     EXPECT_EQ(
         UnicodeBlock::BASIC_LATIN,
         getUnicodeBlock(codepoint)
     );
 }
+
+TEST(TestGetUnicodeBlock, GeneralPunctuation) {
+    uint32_t codepoint = 8212; // U+2014 EM DASH (unicode long dash)
+    EXPECT_EQ(
+        UnicodeBlock::GENERAL_PUNCTUATION,
+        getUnicodeBlock(codepoint)
+    );
+}
+
+TEST(TestGetUnicodeBlock, Latin1Supplement) {
+    uint32_t codepoint = 250; // U+00FA, small u with acute accent
+    EXPECT_EQ(
+        UnicodeBlock::LATIN_1_SUPPLEMENT,
+        getUnicodeBlock(codepoint)
+    );
+}
+
+TEST(TestGetUnicodeBlock, LatinExtendedA) {
+    uint32_t codepoint = 312; // U+0138 KRA: looks like an adorably small capital K
+    EXPECT_EQ(
+        UnicodeBlock::LATIN_EXTENDED_A,
+        getUnicodeBlock(codepoint)
+    );
+}
\ No newline at end of file
diff --git a/src/libunicode/test_unit/test_code_point_support.cpp b/src/libunicode/test_unit/test_code_point_support.cpp
index 1b71a6d..683f15f 100644
--- a/src/libunicode/test_unit/test_code_point_support.cpp
+++ b/src/libunicode/test_unit/test_code_point_support.cpp
@@ -7,11 +7,32 @@ using namespace relevanced;
 using namespace relevanced::libunicode;
 using namespace relevanced::util;
 
-TEST(TestCodePointSupport, Simple) {
-    uint32_t codepoint = (uint32_t) ((unsigned char) 'a');
-    EXPECT_TRUE(isLetterPoint(codepoint));
+TEST(TestIsLetterPoint, Simple) {
+    uint32_t codePoint = (uint32_t) ((unsigned char) 'a');
+    EXPECT_TRUE(isLetterPoint(codePoint));
 
-    codepoint = (uint32_t) ((unsigned char) '.');
-    EXPECT_FALSE(isLetterPoint(codepoint));
+    codePoint = (uint32_t) ((unsigned char) '.');
+    EXPECT_FALSE(isLetterPoint(codePoint));
+}
+
+TEST(TestIsLetterPoint, UnicodePunctuation) {
+    uint32_t codePoint = 8212; // U+2014 EM DASH (unicode long dash)
+    EXPECT_FALSE(isLetterPoint(codePoint));
+}
+
+TEST(TestIsLetterPoint, Numbers) {
+    for (unsigned char c = '0'; c <= '9'; c++) { // ASCII digits, not raw code points 0..9
+        uint32_t codePoint = c;
+        EXPECT_FALSE(isLetterPoint(codePoint));
+    }
+}
+
+TEST(TestIsLetterPoint, GermanLetter) {
+    uint32_t codePoint = 223; // U+00DF Eszett, the big funny B
+    EXPECT_TRUE(isLetterPoint(codePoint));
+}
 
+TEST(TestIsLetterPoint, SpanishLetter) {
+    uint32_t codePoint = 209; // U+00D1 capital ene (n with tilde)
+    EXPECT_TRUE(isLetterPoint(codePoint));
 }