From 6ef9a0bf068c4be258339d37a76e2c2fc5681bc9 Mon Sep 17 00:00:00 2001 From: Pablo Emilio Escobar Gaviria Date: Mon, 1 Feb 2021 20:34:12 +0000 Subject: [PATCH 1/5] Documented stuff used by Mathics --- mathics_scanner/errors.py | 8 +++-- mathics_scanner/tokeniser.py | 62 +++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py index 4ef27fb..438ea3b 100644 --- a/mathics_scanner/errors.py +++ b/mathics_scanner/errors.py @@ -3,17 +3,21 @@ class TranslateError(Exception): - def __init__(self): - pass + """A generic class of tokenizing errors""" + pass class ScanError(TranslateError): + """A generic scanning error""" pass class InvalidSyntaxError(TranslateError): + """Invalid syntax""" pass class IncompleteSyntaxError(TranslateError): + """More characters were expected to form a valid token""" pass + diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index a2bbeca..9717845 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -305,11 +305,22 @@ def compile_tokens(token_list): def is_symbol_name(text): + """ + Returns ``True`` if ``text`` is a valid identifier. Otherwise returns + ``False``. + """ + # Can't we just call match here? return full_symbol_pattern.sub("", text) == "" class Token(object): + "A representation of a Wolfram Language token" def __init__(self, tag, text, pos): + """ + @param: tag A string that indicates which type of token this is. + @param: text The actual contents of the token. + @param: pos The position of the token in the input feed. + """ self.tag = tag self.text = text self.pos = pos @@ -326,28 +337,54 @@ def __repr__(self): class Tokeniser(object): + """ + A tokenizer for the Wolfram Language. 
+ + When subclassing ``Tokeniser``, custom tokenisation rules can be defined by + declaring methods whose names are preceded by ``t_``, such as in the + following example: :: + + class MyTokeniser(Tokeniser): + def t_MyWeirdRule(self, match): + # Your logic goes here... + pass + + In this example, ``t_MyWeirdRule`` is supposed to update the internal state + of the tokeniser and return a ``Token`` with an appropriate tag. ``m̀atch`` + is expected to be an instance of ``re.Match``. + """ modes = { "expr": (tokens, token_indices), "filename": (filename_tokens, {}), } def __init__(self, feeder): + """ + @param: feeder An instance of ``LineFeeder`` which will feed characters + to the tokenizer. + """ self.pos = 0 self.feeder = feeder self.prescanner = Prescanner(feeder) self.code = self.prescanner.scan() self.change_mode("expr") + # TODO: Turn this into a setter in the future? def change_mode(self, mode): + """ + Set the mode of the tokenizer + """ self.mode = mode self.tokens, self.token_indices = self.modes[mode] + # TODO: Rename this to something that remotetly makes sense? def incomplete(self): - "get more code from the prescanner and continue" + "Get more code from the prescanner and continue" self.prescanner.incomplete() self.code += self.prescanner.scan() def sntx_message(self, pos=None): + """Send a message to the feeder.""" if pos is None: pos = self.pos pre, post = self.code[:pos], self.code[pos:].rstrip("\n") @@ -356,8 +393,9 @@ def sntx_message(self, pos=None): else: self.feeder.message("Syntax", "sntxf", pre, post) + # TODO: Convert this to __next__ in the future? 
def next(self): - "return next token" + "Returns the next token" self.skip_blank() if self.pos >= len(self.code): return Token("END", "", len(self.code)) @@ -391,7 +429,7 @@ def next(self): return Token(tag, text, match.start(0)) def skip_blank(self): - "skip whitespace and comments" + "Skip whitespace and comments" comment = [] # start positions of comments while True: if self.pos >= len(self.code): @@ -417,6 +455,7 @@ def skip_blank(self): break def t_String(self, match): + "``String`` tokenizer" start, end = self.pos, None self.pos += 1 # skip opening '"' newlines = [] @@ -444,6 +483,7 @@ def t_String(self, match): return Token("String", result, start) def t_Number(self, match): + "Number tag" text = match.group(0) pos = match.end(0) if self.code[pos - 1 : pos + 1] == "..": @@ -454,7 +494,8 @@ def t_Number(self, match): self.pos = pos return Token("Number", text, match.start(0)) - def token_mode(self, match, tag, mode): + # This isn't outside of here so it's considered internal + def _token_mode(self, match, tag, mode): "consume a token and switch mode" text = match.group(0) self.pos = match.end(0) @@ -462,13 +503,18 @@ def token_mode(self, match, tag, mode): return Token(tag, text, match.start(0)) def t_Get(self, match): - return self.token_mode(match, "Get", "filename") + "Get tag" + return self._token_mode(match, "Get", "filename") def t_Put(self, match): - return self.token_mode(match, "Put", "filename") + "Put tag" + return self._token_mode(match, "Put", "filename") def t_PutAppend(self, match): - return self.token_mode(match, "PutAppend", "filename") + "PutAppend tag" + return self._token_mode(match, "PutAppend", "filename") def t_Filename(self, match): - return self.token_mode(match, "Filename", "expr") + "Filename tag" + return self._token_mode(match, "Filename", "expr") + From cbfdd040a7b80bb55a65473b63720a93ad65ec22 Mon Sep 17 00:00:00 2001 From: Pablo Emilio Escobar Gaviria Date: Mon, 1 Feb 2021 21:04:22 +0000 Subject: [PATCH 2/5] Fixed typos --- 
mathics_scanner/tokeniser.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 9717845..c6fb01e 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -338,7 +338,7 @@ def __repr__(self): class Tokeniser(object): """ - A tokenizer for the Wolfram Language. + A tokeniser for the Wolfram Language. When subclassing ``Tokeniser``, custom tokenisation rules can be defined by declaring methods whose names are preceded by ``t_``, such as in the @@ -361,7 +361,7 @@ def t_MyWeirdRule(self, match): def __init__(self, feeder): """ @param: feeder An instance of ``LineFeeder`` which will feed characters - to the tokenizer. + to the tokeniser. """ self.pos = 0 self.feeder = feeder @@ -372,7 +372,7 @@ def __init__(self, feeder): # TODO: Turn this into a setter in the future? def change_mode(self, mode): """ - Set the mode of the tokenizer + Set the mode of the tokeniser """ self.mode = mode self.tokens, self.token_indices = self.modes[mode] @@ -455,7 +455,7 @@ def skip_blank(self): break def t_String(self, match): - "``String`` tokenizer" + "String rule" start, end = self.pos, None self.pos += 1 # skip opening '"' newlines = [] @@ -483,7 +483,7 @@ def t_String(self, match): return Token("String", result, start) def t_Number(self, match): - "Number tag" + "Number rule" text = match.group(0) pos = match.end(0) if self.code[pos - 1 : pos + 1] == "..": @@ -503,18 +503,18 @@ def _token_mode(self, match, tag, mode): return Token(tag, text, match.start(0)) def t_Get(self, match): - "Get tag" + "Get rule" return self._token_mode(match, "Get", "filename") def t_Put(self, match): - "Put tag" + "Put rule" return self._token_mode(match, "Put", "filename") def t_PutAppend(self, match): - "PutAppend tag" + "PutAppend rule" return self._token_mode(match, "PutAppend", "filename") def t_Filename(self, match): - "Filename tag" + "Filename rule" return 
self._token_mode(match, "Filename", "expr")

From 03ea0c604fedee428a624d52cbe926ab350aa336 Mon Sep 17 00:00:00 2001
From: Pablo Emilio Escobar Gaviria
Date: Tue, 2 Feb 2021 13:04:24 +0000
Subject: [PATCH 3/5] Fixed typo

---
 mathics_scanner/errors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py
index 438ea3b..ac8c0c7 100644
--- a/mathics_scanner/errors.py
+++ b/mathics_scanner/errors.py
@@ -3,7 +3,7 @@


 class TranslateError(Exception):
-    """A generic class of tokenizing errors"""
+    """A generic class of tokenization errors"""

     pass


From 73e8a43eca50299d1aa1e6b7aedc6d2f853653fc Mon Sep 17 00:00:00 2001
From: Pablo Emilio Escobar Gaviria
Date: Tue, 2 Feb 2021 13:19:23 +0000
Subject: [PATCH 4/5] Documented the feeders

---
 mathics_scanner/feed.py | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py
index 07c5251..6714b64 100644
--- a/mathics_scanner/feed.py
+++ b/mathics_scanner/feed.py
@@ -8,7 +8,16 @@


 class LineFeeder(metaclass=ABCMeta):
+    """
+    An abstract representation for a feeder. The purpose of a feeder is to
+    mediate the consumption of characters between the tokeniser and the actual
+    file being scanned, as well as to store messages regarding tokenization errors.
+    """

     def __init__(self, filename):
+        """
+        @param: filename A string that describes the source of the feeder, i.e.
+        the filename that is being fed.
+        """
         self.messages = []
         self.lineno = 0
         self.filename = filename
@@ -29,6 +38,9 @@ def empty(self):
         return

     def message(self, sym, tag, *args):
+        """
+        Append a generic message of type ``sym`` to the message queue.
+        """
         if sym == "Syntax":
             message = self.syntax_message(sym, tag, *args)
         else:
@@ -36,6 +48,9 @@ def message(self, sym, tag, *args):
         self.messages.append(message)

     def syntax_message(self, sym, tag, *args):
+        """
+        Append a message concerning syntax errors to the message queue.
+        """
         if len(args) > 3:
             raise ValueError("Too many args.")
         message = [sym, tag]
@@ -49,6 +64,7 @@ def syntax_message(self, sym, tag, *args):
         assert len(message) == 7
         return message

+    # TODO: Rethink this (this is only useful for core, not anyone else)
     def send_messages(self, evaluation):
         for message in self.messages:
             evaluation.message(*message)
@@ -56,9 +72,14 @@


 class MultiLineFeeder(LineFeeder):
-    "Feeds one line at a time."
+    "A feeder that feeds one line at a time."

     def __init__(self, lines, filename=""):
+        """
+        @param: lines The source of the feeder (a string).
+        @param: filename A string that describes the source of the feeder, i.e.
+        the filename that is being fed.
+        """
         super(MultiLineFeeder, self).__init__(filename)
         self.lineno = 0
         if isinstance(lines, str):
@@ -79,9 +100,14 @@ def empty(self):


 class SingleLineFeeder(LineFeeder):
-    "Feeds all the code as a single line."
+    "A feeder that feeds all the code as a single line."

     def __init__(self, code, filename=""):
+        """
+        @param: code The source of the feeder (a string).
+        @param: filename A string that describes the source of the feeder, i.e.
+        the filename that is being fed.
+        """
         super().__init__(filename)
         self.code = code
         self._empty = False
@@ -98,9 +124,14 @@ def empty(self):


 class FileLineFeeder(LineFeeder):
-    "Feeds lines from an open file object"
+    "A feeder that feeds lines from an open ``File`` object"

     def __init__(self, fileobject, trace_fn=None):
+        """
+        @param: fileobject The source of the feeder (an open file object).
+        @param: trace_fn An optional tracing callable (NOTE(review): exact
+        semantics should be confirmed against its use in ``feed``).
+        """
         super().__init__(fileobject.name)
         self.fileobject = fileobject
         self.lineno = 0
@@ -122,3 +153,4 @@ def feed(self):

     def empty(self):
         return self.eof
+

From dbae4b3e0fe5c1a230724ce6e30cbf353e04f2ba Mon Sep 17 00:00:00 2001
From: Pablo Emilio Escobar Gaviria
Date: Tue, 2 Feb 2021 13:54:38 +0000
Subject: [PATCH 5/5] Marked additional methods of Tokeniser as private

These methods are only useful internally and are not used by core anywhere

---
 mathics_scanner/tokeniser.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index c6fb01e..958efa2 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -367,10 +367,9 @@ def __init__(self, feeder):
         self.feeder = feeder
         self.prescanner = Prescanner(feeder)
         self.code = self.prescanner.scan()
-        self.change_mode("expr")
+        self._change_mode("expr")

-    # TODO: Turn this into a setter in the future?
-    def change_mode(self, mode):
+    def _change_mode(self, mode):
         """
         Set the mode of the tokeniser
         """
@@ -396,7 +395,7 @@ def sntx_message(self, pos=None):
     # TODO: Convert this to __next__ in the future?
     def next(self):
         "Returns the next token"
-        self.skip_blank()
+        self._skip_blank()
         if self.pos >= len(self.code):
             return Token("END", "", len(self.code))
@@ -428,7 +427,7 @@ def next(self):
         self.pos = match.end(0)
         return Token(tag, text, match.start(0))

-    def skip_blank(self):
+    def _skip_blank(self):
         "Skip whitespace and comments"
         comment = []  # start positions of comments
         while True:
             if self.pos >= len(self.code):
@@ -499,7 +498,7 @@ def _token_mode(self, match, tag, mode):
         "consume a token and switch mode"
         text = match.group(0)
         self.pos = match.end(0)
-        self.change_mode(mode)
+        self._change_mode(mode)
         return Token(tag, text, match.start(0))

     def t_Get(self, match):