Document the tokeniser #11

Merged: 5 commits, Feb 6, 2021
8 changes: 6 additions & 2 deletions mathics_scanner/errors.py
@@ -3,17 +3,21 @@


class TranslateError(Exception):
def __init__(self):
pass
"""A generic class of tokenization errors"""
pass


class ScanError(TranslateError):
"""A generic scanning error"""
pass


class InvalidSyntaxError(TranslateError):
"""Invalid syntax"""
pass


class IncompleteSyntaxError(TranslateError):
"""More characters were expected to form a valid token"""
pass

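The docstrings above also pin down the intended hierarchy: ``TranslateError`` is the common base class, and the three subclasses distinguish scan failures, invalid syntax, and input that simply ended too early. A minimal sketch of how a caller can rely on that hierarchy (only the exception classes come from this diff; everything else is illustrative):

    from mathics_scanner.errors import (
        IncompleteSyntaxError,
        InvalidSyntaxError,
        ScanError,
        TranslateError,
    )

    # Every concrete error is also a TranslateError, so callers that only need a
    # yes/no answer can catch the base class alone.
    assert issubclass(ScanError, TranslateError)
    assert issubclass(InvalidSyntaxError, TranslateError)
    assert issubclass(IncompleteSyntaxError, TranslateError)

    # An interactive front end, by contrast, can single out "the input ended too
    # early" and prompt for a continuation line instead of reporting a hard error.
    def wants_more_input(error: TranslateError) -> bool:
        return isinstance(error, IncompleteSyntaxError)
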
38 changes: 35 additions & 3 deletions mathics_scanner/feed.py
@@ -8,7 +8,16 @@


class LineFeeder(metaclass=ABCMeta):
"""
An abstract representation for a feeder. The purpose of a feeder is to
mediate the consumption of characters between the tokeniser and the actual
file being scanned, as well as to store messages regarding tokenization errors.
"""
def __init__(self, filename):
"""
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being fed.
"""
self.messages = []
self.lineno = 0
self.filename = filename
@@ -29,13 +38,19 @@ def empty(self):
return

def message(self, sym, tag, *args):
"""
Append a generic message of type ``sym`` to the message queue.
"""
if sym == "Syntax":
message = self.syntax_message(sym, tag, *args)
else:
message = [sym, tag] + list(args)
self.messages.append(message)

def syntax_message(self, sym, tag, *args):
"""
Construct and return a message describing a syntax error.
"""
if len(args) > 3:
raise ValueError("Too many args.")
message = [sym, tag]
@@ -49,16 +64,22 @@ def syntax_message(self, sym, tag, *args):
assert len(message) == 7
return message

# TODO: Rethink this (this is only useful for core, not anyone else)
def send_messages(self, evaluation):
for message in self.messages:
evaluation.message(*message)
self.messages = []


class MultiLineFeeder(LineFeeder):
"Feeds one line at a time."
"A feeder that feeds one line at a time."

def __init__(self, lines, filename=""):
"""
@param: lines The source of the feeder (a string or a list of lines).
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being fed.
"""
super(MultiLineFeeder, self).__init__(filename)
self.lineno = 0
if isinstance(lines, str):
@@ -79,9 +100,14 @@ def empty(self):


class SingleLineFeeder(LineFeeder):
"Feeds all the code as a single line."
"A feeder that feeds all the code as a single line."

def __init__(self, code, filename=""):
"""
@param: code The source of the feeder (a string).
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being fed.
"""
super().__init__(filename)
self.code = code
self._empty = False
@@ -98,9 +124,14 @@ def empty(self):


class FileLineFeeder(LineFeeder):
"Feeds lines from an open file object"
"A feeder that feeds lines from an open ``File`` object"

def __init__(self, fileobject, trace_fn=None):
"""
@param: fileobject The source of the feeder (an open file object).
@param: trace_fn An optional tracing callback.
"""
super().__init__(fileobject.name)
self.fileobject = fileobject
self.lineno = 0
@@ -122,3 +153,4 @@ def feed(self):

def empty(self):
return self.eof

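The constructor signatures documented above are enough to build each concrete feeder. A minimal sketch, assuming only what this diff shows (the sample program, the temporary file, and the ``.wl`` suffix are illustrative):

    import tempfile

    from mathics_scanner.feed import FileLineFeeder, MultiLineFeeder, SingleLineFeeder

    # Hand the whole program over as a single line...
    single = SingleLineFeeder("f[x_] := x ^ 2", filename="<string>")

    # ...or line by line.
    multi = MultiLineFeeder("f[x_] := x ^ 2\nf[3]\n", filename="<string>")

    # FileLineFeeder wants an open file object and takes its filename from it.
    with tempfile.NamedTemporaryFile("w", suffix=".wl", delete=False) as handle:
        handle.write("f[x_] := x ^ 2\nf[3]\n")

    with open(handle.name) as wl_file:
        file_feeder = FileLineFeeder(wl_file)
        while not file_feeder.empty():        # empty() reports when the source is exhausted
            print(repr(file_feeder.feed()))   # feed() hands back the next chunk of characters
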
71 changes: 58 additions & 13 deletions mathics_scanner/tokeniser.py
@@ -305,11 +305,22 @@ def compile_tokens(token_list):


def is_symbol_name(text):
"""
Returns ``True`` if ``text`` is a valid identifier. Otherwise returns
``False``.
"""
# Can't we just call match here?
return full_symbol_pattern.sub("", text) == ""


class Token(object):
"A representation of a Wolfram Language token"
def __init__(self, tag, text, pos):
"""
@param: tag A string that indicates which type of token this is.
@param: text The actual contents of the token.
@param: pos The position of the token in the input feed.
"""
self.tag = tag
self.text = text
self.pos = pos
@@ -326,28 +337,53 @@ def __repr__(self):


class Tokeniser(object):
"""
A tokeniser for the Wolfram Language.

When subclassing ``Tokeniser``, custom tokenisation rules can be defined by

Inline review comment from @GarkGarcia (Contributor, Author), Feb 2, 2021:

I don't think this mechanism should be exposed in the public API (and therefore it shouldn't be documented in the docstring). If you think about it, all that consumers of this library want is a functioning WL tokeniser they can use as a black box (that's what I think, at least).

This definitely should be documented somewhere, though. @rocky I'd appreciate it if we could merge #8 before this, so that I can move this information to implementation.rst. I also plan to convert implementation.rst and the rest of the documentation to a proper Sphinx document before we release the library (which should be pretty easy to do, so it's not going to take too much time).

declaring methods whose names are prefixed with ``t_``, such as in the
following example: ::

class MyTokeniser(Tokeniser):
def t_MyWeirdRule(self, match):
# Your logic goes here...
pass

In this example, ``t_MyWeirdRule`` is supposed to update the internal state
of the tokeniser and return a ``Token`` with an appropriate tag. ``match``
is expected to be an instance of ``re.Match``.
"""
modes = {
"expr": (tokens, token_indices),
"filename": (filename_tokens, {}),
}

def __init__(self, feeder):
"""
@param: feeder An instance of ``LineFeeder`` which will feed characters
to the tokeniser.
"""
self.pos = 0
self.feeder = feeder
self.prescanner = Prescanner(feeder)
self.code = self.prescanner.scan()
self.change_mode("expr")
self._change_mode("expr")

def change_mode(self, mode):
def _change_mode(self, mode):
"""
Set the mode of the tokeniser
"""
self.mode = mode
self.tokens, self.token_indices = self.modes[mode]

# TODO: Rename this to something that remotely makes sense?
def incomplete(self):
"get more code from the prescanner and continue"
"Get more code from the prescanner and continue"
self.prescanner.incomplete()
self.code += self.prescanner.scan()

def sntx_message(self, pos=None):
"""Send a message to the feeder."""
if pos is None:
pos = self.pos
pre, post = self.code[:pos], self.code[pos:].rstrip("\n")
@@ -356,9 +392,10 @@ def sntx_message(self, pos=None):
else:
self.feeder.message("Syntax", "sntxf", pre, post)

# TODO: Convert this to __next__ in the future?
def next(self):
"return next token"
self.skip_blank()
"Returns the next token"
self._skip_blank()
if self.pos >= len(self.code):
return Token("END", "", len(self.code))

@@ -390,8 +427,8 @@ def next(self):
self.pos = match.end(0)
return Token(tag, text, match.start(0))

def skip_blank(self):
"skip whitespace and comments"
def _skip_blank(self):
"Skip whitespace and comments"
comment = [] # start positions of comments
while True:
if self.pos >= len(self.code):
@@ -417,6 +454,7 @@ def skip_blank(self):
break

def t_String(self, match):
"String rule"
start, end = self.pos, None
self.pos += 1 # skip opening '"'
newlines = []
@@ -444,6 +482,7 @@ def skip_blank(self):
return Token("String", result, start)

def t_Number(self, match):
"Number rule"
text = match.group(0)
pos = match.end(0)
if self.code[pos - 1 : pos + 1] == "..":
@@ -454,21 +493,27 @@ def skip_blank(self):
self.pos = pos
return Token("Number", text, match.start(0))

def token_mode(self, match, tag, mode):
# This isn't used outside of this class, so it's considered internal
def _token_mode(self, match, tag, mode):
"consume a token and switch mode"
text = match.group(0)
self.pos = match.end(0)
self.change_mode(mode)
self._change_mode(mode)
return Token(tag, text, match.start(0))

def t_Get(self, match):
return self.token_mode(match, "Get", "filename")
"Get rule"
return self._token_mode(match, "Get", "filename")

def t_Put(self, match):
return self.token_mode(match, "Put", "filename")
"Put rule"
return self._token_mode(match, "Put", "filename")

def t_PutAppend(self, match):
return self.token_mode(match, "PutAppend", "filename")
"PutAppend rule"
return self._token_mode(match, "PutAppend", "filename")

def t_Filename(self, match):
return self.token_mode(match, "Filename", "expr")
"Filename rule"
return self._token_mode(match, "Filename", "expr")
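
Taken together with the feeders above, the documented surface already supports the black-box use the review comment argues for: build a feeder, wrap it in a ``Tokeniser``, and pull tokens until ``END``. A minimal sketch (the exact tag names shown in the comment are illustrative, not a claim about the tokeniser's token table):

    from mathics_scanner.feed import SingleLineFeeder
    from mathics_scanner.tokeniser import Tokeniser, is_symbol_name

    feeder = SingleLineFeeder("x + 42", filename="<example>")
    tokeniser = Tokeniser(feeder)

    tokens = []
    while True:
        token = tokeniser.next()      # next() returns the next Token
        if token.tag == "END":        # an END token marks the end of the input
            break
        tokens.append((token.tag, token.text, token.pos))

    # Each entry pairs a tag with the matched text and its position in the feed,
    # along the lines of ("Symbol", "x", 0), ("Plus", "+", 2), ("Number", "42", 4).
    print(tokens)

    # is_symbol_name() checks a whole string against the full symbol pattern.
    print(is_symbol_name("System`Plus"), is_symbol_name("2notASymbol"))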