Skip to content
This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Permalink
Internal merge
Browse files: browse the repository at this point in the history
PiperOrigin-RevId: 181811258
  • Loading branch information
Ryan Sepassi committed Jan 13, 2018
1 parent 164d175 commit c7f24da
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 9 deletions.
18 changes: 10 additions & 8 deletions tensor2tensor/utils/bleu_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,13 +149,14 @@ class UnicodeRegex(object):
"""Ad-hoc hack to recognize all punctuation and symbols."""

def __init__(self):
def _property_chars(prefix):
return "".join(six.unichr(x) for x in range(sys.maxunicode)
if unicodedata.category(six.unichr(x)).startswith(prefix))
punctuation = self._property_chars("P")
punctuation = self.property_chars("P")
self.nondigit_punct_re = re.compile(r"([^\d])([" + punctuation + r"])")
self.punct_nondigit_re = re.compile(r"([" + punctuation + r"])([^\d])")
self.symbol_re = re.compile("([" + _property_chars("S") + "])")
self.symbol_re = re.compile("([" + self.property_chars("S") + "])")

def property_chars(self, prefix):
return "".join(six.unichr(x) for x in range(sys.maxunicode)
if unicodedata.category(six.unichr(x)).startswith(prefix))


def bleu_tokenize(string):
Expand All @@ -182,9 +183,10 @@ def bleu_tokenize(string):
Returns:
a list of tokens
"""
string = UnicodeRegex.nondigit_punct_re.sub(r"\1 \2 ", string)
string = UnicodeRegex.punct_nondigit_re.sub(r" \1 \2", string)
string = UnicodeRegex.symbol_re.sub(r" \1 ", string)
uregex = UnicodeRegex()
string = uregex.nondigit_punct_re.sub(r"\1 \2 ", string)
string = uregex.punct_nondigit_re.sub(r" \1 \2", string)
string = uregex.symbol_re.sub(r" \1 ", string)
return string.split()


Expand Down
8 changes: 7 additions & 1 deletion tensor2tensor/utils/bleu_hook_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# coding=utf-8
"""Tests for tensor2tensor.utils.bleu_hook."""

from __future__ import absolute_import
Expand Down Expand Up @@ -57,5 +58,10 @@ def testComputeMultipleNgrams(self):
actual_bleu = 0.3436
self.assertAllClose(bleu, actual_bleu, atol=1e-03)

if __name__ == '__main__':
def testBleuTokenize(self):
self.assertEqual(bleu_hook.bleu_tokenize(u"hi, “there”"),
[u"hi", u",", u"“", u"there", u"”"])


if __name__ == "__main__":
tf.test.main()

0 comments on commit c7f24da

Please sign in to comment.