Merge pull request #11 from Mathics3/tokenizer-documentation
Document the tokeniser
rocky authored Feb 6, 2021
2 parents 295fe97 + dbae4b3 commit 656a7cc
Showing 3 changed files with 99 additions and 18 deletions.
8 changes: 6 additions & 2 deletions mathics_scanner/errors.py
@@ -3,17 +3,21 @@


class TranslateError(Exception):
- def __init__(self):
-     pass
+ """A generic class of tokenization errors"""
+ pass


class ScanError(TranslateError):
"""A generic scanning error"""
pass


class InvalidSyntaxError(TranslateError):
"""Invalid syntax"""
pass


class IncompleteSyntaxError(TranslateError):
"""More characters were expected to form a valid token"""
pass
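Since ``ScanError``, ``InvalidSyntaxError`` and ``IncompleteSyntaxError`` all derive from ``TranslateError``, callers can handle them uniformly or specifically. A minimal sketch of that pattern follows; the ``handle`` driver and its ``tokenise`` callable are hypothetical illustrations, not part of this commit:

    from mathics_scanner.errors import IncompleteSyntaxError, TranslateError

    def handle(tokenise):
        # `tokenise` is a hypothetical zero-argument callable that may
        # raise the exceptions defined above.
        try:
            return tokenise()
        except IncompleteSyntaxError:
            # Subclass first: more characters were expected, e.g. an
            # unclosed string; a REPL would prompt for another line here.
            return None
        except TranslateError as err:
            # The base class catches any remaining tokenization error.
            print(f"tokenization failed: {type(err).__name__}")
            return None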

38 changes: 35 additions & 3 deletions mathics_scanner/feed.py
@@ -8,7 +8,16 @@


class LineFeeder(metaclass=ABCMeta):
"""
An abstract representation for a feeder. The purpose of a feeder is to
mediate the consumption of characters between the tokeniser and the actual
file being scaned, as well to store messages regarding tokenization errors.
"""
def __init__(self, filename):
"""
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being feed.
"""
self.messages = []
self.lineno = 0
self.filename = filename
@@ -29,13 +38,19 @@ def empty(self):
return

def message(self, sym, tag, *args):
"""
Append a generic message of type ``sym`` to the message queue.
"""
if sym == "Syntax":
message = self.syntax_message(sym, tag, *args)
else:
message = [sym, tag] + list(args)
self.messages.append(message)

def syntax_message(self, sym, tag, *args):
"""
Append a message concerning syntax errors to the message queue.
"""
if len(args) > 3:
raise ValueError("Too many args.")
message = [sym, tag]
@@ -49,16 +64,22 @@ def syntax_message(self, sym, tag, *args):
assert len(message) == 7
return message

# TODO: Rethink this (this is only useful for core, not for anyone else)
def send_messages(self, evaluation):
for message in self.messages:
evaluation.message(*message)
self.messages = []


class MultiLineFeeder(LineFeeder):
"Feeds one line at a time."
"A feeder that feeds one line at a time."

def __init__(self, lines, filename=""):
"""
@param: lines The source of the feeder (a string).
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being feed.
"""
super(MultiLineFeeder, self).__init__(filename)
self.lineno = 0
if isinstance(lines, str):
@@ -79,9 +100,14 @@ def empty(self):


class SingleLineFeeder(LineFeeder):
"Feeds all the code as a single line."
"A feeder that feeds all the code as a single line."

def __init__(self, code, filename=""):
"""
@param: code The source of the feeder (a string).
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being feed.
"""
super().__init__(filename)
self.code = code
self._empty = False
@@ -98,9 +124,14 @@ def empty(self):


class FileLineFeeder(LineFeeder):
"Feeds lines from an open file object"
"A feeder that feeds lines from an open ``File`` object"

def __init__(self, fileobject, trace_fn=None):
"""
@param: fileobject The source of the feeder (a string).
@param: filename A string that describes the source of the feeder,
i.e. the filename that is being feed.
"""
super().__init__(fileobject.name)
self.fileobject = fileobject
self.lineno = 0
@@ -122,3 +153,4 @@ def feed(self):

def empty(self):
return self.eof
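Together, these subclasses suggest the contract a custom feeder must honour: implement ``feed()`` to return the next chunk of source (an empty string once exhausted) and ``empty()`` to report exhaustion. A minimal sketch under those assumptions; ``ListLineFeeder`` is illustrative and not part of this commit:

    from mathics_scanner.feed import LineFeeder

    class ListLineFeeder(LineFeeder):
        "A hypothetical feeder that serves lines from a Python list."

        def __init__(self, lines, filename="<list>"):
            super().__init__(filename)
            self._lines = lines

        def feed(self):
            # Return the next line, or "" once the list is exhausted.
            if self.lineno < len(self._lines):
                line = self._lines[self.lineno]
                self.lineno += 1
                return line
            return ""

        def empty(self):
            return self.lineno >= len(self._lines)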

71 changes: 58 additions & 13 deletions mathics_scanner/tokeniser.py
@@ -305,11 +305,22 @@ def compile_tokens(token_list):


def is_symbol_name(text):
"""
Returns ``True`` if ``text`` is a valid identifier. Otherwise returns
``False``.
"""
# Can't we just call match here?
return full_symbol_pattern.sub("", text) == ""
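The inline question above asks whether a direct match would do; with a compiled pattern, ``re.fullmatch`` expresses the same whole-string test more directly than the ``sub("") == ""`` trick. A sketch, assuming ``full_symbol_pattern`` is a compiled ``re.Pattern`` matching exactly one symbol; this is not part of the commit:

    def is_symbol_name(text):
        # fullmatch succeeds only if the entire string is one symbol,
        # which is what sub("", text) == "" checks indirectly.
        return full_symbol_pattern.fullmatch(text) is not None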


class Token(object):
"A representation of a Wolfram Language token"
def __init__(self, tag, text, pos):
"""
@param: tag A string that indicates which type of token this is.
@param: text The actual contents of the token.
@param: pos The position of the token in the input feed.
"""
self.tag = tag
self.text = text
self.pos = pos
@@ -326,28 +337,53 @@ def __repr__(self):
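For illustration, a token for the input ``42`` at the start of the feed would carry the values below (the concrete instance is hypothetical, built only from the constructor shown above):

    tok = Token("Number", "42", 0)
    assert tok.tag == "Number"
    assert tok.text == "42"
    assert tok.pos == 0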


class Tokeniser(object):
"""
A tokeniser for the Wolfram Language.
When subclassing ``Tokeniser``, custom tokenisation rules can be defined by
declaring methods whose names are preceded by ``t_``, such as in the
following example: ::
class MyTokeniser(Tokeniser):
def t_MyWeirdRule(self, match):
# Your logic goes here...
pass
In this example, ``t_MyWeirdRule`` is supposed to update the internal state
of the tokeniser and return a ``Token`` with an appropriate tag. ``m̀atch``
is expected to be an instance of ``re.Match``.
"""
modes = {
"expr": (tokens, token_indices),
"filename": (filename_tokens, {}),
}

def __init__(self, feeder):
"""
@param: feeder An instance of ``LineFeeder`` which will feed characters
to the tokeniser.
"""
self.pos = 0
self.feeder = feeder
self.prescanner = Prescanner(feeder)
self.code = self.prescanner.scan()
- self.change_mode("expr")
+ self._change_mode("expr")

- def change_mode(self, mode):
+ def _change_mode(self, mode):
"""
Set the mode of the tokeniser
"""
self.mode = mode
self.tokens, self.token_indices = self.modes[mode]

# TODO: Rename this to something that remotely makes sense?
def incomplete(self):
"get more code from the prescanner and continue"
"Get more code from the prescanner and continue"
self.prescanner.incomplete()
self.code += self.prescanner.scan()

def sntx_message(self, pos=None):
"""Send a message to the feeder."""
if pos is None:
pos = self.pos
pre, post = self.code[:pos], self.code[pos:].rstrip("\n")
@@ -356,9 +392,10 @@ def sntx_message(self, pos=None):
else:
self.feeder.message("Syntax", "sntxf", pre, post)

# TODO: Convert this to __next__ in the future?
def next(self):
"return next token"
self.skip_blank()
"Returns the next token"
self._skip_blank()
if self.pos >= len(self.code):
return Token("END", "", len(self.code))

@@ -390,8 +427,8 @@ def next(self):
self.pos = match.end(0)
return Token(tag, text, match.start(0))

- def skip_blank(self):
-     "skip whitespace and comments"
+ def _skip_blank(self):
+     "Skip whitespace and comments"
comment = [] # start positions of comments
while True:
if self.pos >= len(self.code):
@@ -417,6 +454,7 @@ def skip_blank(self):
break

def t_String(self, match):
"String rule"
start, end = self.pos, None
self.pos += 1 # skip opening '"'
newlines = []
@@ -444,6 +482,7 @@ def t_String(self, match):
return Token("String", result, start)

def t_Number(self, match):
"Number rule"
text = match.group(0)
pos = match.end(0)
if self.code[pos - 1 : pos + 1] == "..":
@@ -454,21 +493,27 @@ def t_Number(self, match):
self.pos = pos
return Token("Number", text, match.start(0))

- def token_mode(self, match, tag, mode):
+ # This isn't used outside of this class, so it's considered internal.
+ def _token_mode(self, match, tag, mode):
"consume a token and switch mode"
text = match.group(0)
self.pos = match.end(0)
- self.change_mode(mode)
+ self._change_mode(mode)
return Token(tag, text, match.start(0))

def t_Get(self, match):
return self.token_mode(match, "Get", "filename")
"Get rule"
return self._token_mode(match, "Get", "filename")

def t_Put(self, match):
return self.token_mode(match, "Put", "filename")
"Put rule"
return self._token_mode(match, "Put", "filename")

def t_PutAppend(self, match):
return self.token_mode(match, "PutAppend", "filename")
"PutAppend rule"
return self._token_mode(match, "PutAppend", "filename")

def t_Filename(self, match):
return self.token_mode(match, "Filename", "expr")
"Filename rule"
return self._token_mode(match, "Filename", "expr")
