Merge pull request #11 from Mathics3/tokenizer-documentation
Document the tokeniser
rocky authored Feb 6, 2021
2 parents 295fe97 + dbae4b3 commit 656a7cc
Showing 3 changed files with 99 additions and 18 deletions.
8 changes: 6 additions & 2 deletions mathics_scanner/errors.py
@@ -3,17 +3,21 @@


class TranslateError(Exception):
- def __init__(self):
-     pass
+ """A generic class of tokenization errors"""
+ pass


class ScanError(TranslateError):
"""A generic scanning error"""
pass


class InvalidSyntaxError(TranslateError):
"""Invalid syntax"""
pass


class IncompleteSyntaxError(TranslateError):
"""More characters were expected to form a valid token"""
pass
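Since ``ScanError``, ``InvalidSyntaxError`` and ``IncompleteSyntaxError`` all derive from ``TranslateError``, callers can handle them uniformly or specifically. A minimal sketch of that pattern follows; the ``handle`` driver and its ``tokenise`` callable are hypothetical illustrations, not part of this commit:

    from mathics_scanner.errors import IncompleteSyntaxError, TranslateError

    def handle(tokenise):
        # `tokenise` is a hypothetical zero-argument callable that may
        # raise the exceptions defined above.
        try:
            return tokenise()
        except IncompleteSyntaxError:
            # Subclass first: more characters were expected, e.g. an
            # unclosed string; a REPL would prompt for another line here.
            return None
        except TranslateError as err:
            # The base class catches any remaining tokenization error.
            print(f"tokenization failed: {type(err).__name__}")
            return None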

38 changes: 35 additions & 3 deletions mathics_scanner/feed.py
@@ -8,7 +8,16 @@


class LineFeeder(metaclass=ABCMeta):
"""
An abstract representation for a feeder. The purpose of a feeder is to
mediate the consumption of characters between the tokeniser and the actual
file being scaned, as well to store messages regarding tokenization errors.
"""
def __init__(self, filename):
"""
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being feed.
"""
self.messages = []
self.lineno = 0
self.filename = filename
@@ -29,13 +38,19 @@ def empty(self):
return

def message(self, sym, tag, *args):
"""
Append a generic message of type ``sym`` to the message queue.
"""
if sym == "Syntax":
message = self.syntax_message(sym, tag, *args)
else:
message = [sym, tag] + list(args)
self.messages.append(message)

def syntax_message(self, sym, tag, *args):
"""
Append a message concerning syntax errors to the message queue.
"""
if len(args) > 3:
raise ValueError("Too many args.")
message = [sym, tag]
@@ -49,16 +64,22 @@ def syntax_message(self, sym, tag, *args):
assert len(message) == 7
return message

# TODO: Rethink this (this is only useful for core, not for anyone else)
def send_messages(self, evaluation):
for message in self.messages:
evaluation.message(*message)
self.messages = []


class MultiLineFeeder(LineFeeder):
"Feeds one line at a time."
"A feeder that feeds one line at a time."

def __init__(self, lines, filename=""):
"""
@param: lines The source of the feeder (a string).
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being feed.
"""
super(MultiLineFeeder, self).__init__(filename)
self.lineno = 0
if isinstance(lines, str):
@@ -79,9 +100,14 @@ def empty(self):


class SingleLineFeeder(LineFeeder):
"Feeds all the code as a single line."
"A feeder that feeds all the code as a single line."

def __init__(self, code, filename=""):
"""
@param: code The source of the feeder (a string).
@param: filename A string that describes the source of the feeder, i.e.
the filename that is being feed.
"""
super().__init__(filename)
self.code = code
self._empty = False
@@ -98,9 +124,14 @@ def empty(self):


class FileLineFeeder(LineFeeder):
"Feeds lines from an open file object"
"A feeder that feeds lines from an open ``File`` object"

def __init__(self, fileobject, trace_fn=None):
"""
@param: fileobject The source of the feeder (a string).
@param: filename A string that describes the source of the feeder,
i.e. the filename that is being feed.
"""
super().__init__(fileobject.name)
self.fileobject = fileobject
self.lineno = 0
@@ -122,3 +153,4 @@ def feed(self):

def empty(self):
return self.eof
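Together, these subclasses suggest the contract a custom feeder must honour: implement ``feed()`` to return the next chunk of source (an empty string once exhausted) and ``empty()`` to report exhaustion. A minimal sketch under those assumptions; ``ListLineFeeder`` is illustrative and not part of this commit:

    from mathics_scanner.feed import LineFeeder

    class ListLineFeeder(LineFeeder):
        "A hypothetical feeder that serves lines from a Python list."

        def __init__(self, lines, filename="<list>"):
            super().__init__(filename)
            self._lines = lines

        def feed(self):
            # Return the next line, or "" once the list is exhausted.
            if self.lineno < len(self._lines):
                line = self._lines[self.lineno]
                self.lineno += 1
                return line
            return ""

        def empty(self):
            return self.lineno >= len(self._lines)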

71 changes: 58 additions & 13 deletions mathics_scanner/tokeniser.py
@@ -305,11 +305,22 @@ def compile_tokens(token_list):


def is_symbol_name(text):
"""
Returns ``True`` if ``text`` is a valid identifier. Otherwise returns
``False``.
"""
# Can't we just call match here?
return full_symbol_pattern.sub("", text) == ""
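The inline question above asks whether a direct match would do; with a compiled pattern, ``re.fullmatch`` expresses the same whole-string test more directly than the ``sub("") == ""`` trick. A sketch, assuming ``full_symbol_pattern`` is a compiled ``re.Pattern`` matching exactly one symbol; this is not part of the commit:

    def is_symbol_name(text):
        # fullmatch succeeds only if the entire string is one symbol,
        # which is what sub("", text) == "" checks indirectly.
        return full_symbol_pattern.fullmatch(text) is not None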


class Token(object):
"A representation of a Wolfram Language token"
def __init__(self, tag, text, pos):
"""
@param: tag A string that indicates which type of token this is.
@param: text The actual contents of the token.
@param: pos The position of the token in the input feed.
"""
self.tag = tag
self.text = text
self.pos = pos
@@ -326,28 +337,53 @@ def __repr__(self):
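For illustration, a token for the input ``42`` at the start of the feed would carry the values below (the concrete instance is hypothetical, built only from the constructor shown above):

    tok = Token("Number", "42", 0)
    assert tok.tag == "Number"
    assert tok.text == "42"
    assert tok.pos == 0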


class Tokeniser(object):
"""
A tokeniser for the Wolfram Language.
When subclassing ``Tokeniser``, custom tokenisation rules can be defined by
declaring methods whose names are preceded by ``t_``, such as in the
following example: ::
class MyTokeniser(Tokeniser):
def t_MyWeirdRule(self, match):
# Your logic goes here...
pass
In this example, ``t_MyWeirdRule`` is supposed to update the internal state
of the tokeniser and return a ``Token`` with an appropriate tag. ``m̀atch``
is expected to be an instance of ``re.Match``.
"""
modes = {
"expr": (tokens, token_indices),
"filename": (filename_tokens, {}),
}

def __init__(self, feeder):
"""
@param: feeder An instance of ``LineFeeder`` which will feed characters
to the tokeniser.
"""
self.pos = 0
self.feeder = feeder
self.prescanner = Prescanner(feeder)
self.code = self.prescanner.scan()
- self.change_mode("expr")
+ self._change_mode("expr")

- def change_mode(self, mode):
+ def _change_mode(self, mode):
"""
Set the mode of the tokeniser
"""
self.mode = mode
self.tokens, self.token_indices = self.modes[mode]

# TODO: Rename this to something that remotely makes sense?
def incomplete(self):
"get more code from the prescanner and continue"
"Get more code from the prescanner and continue"
self.prescanner.incomplete()
self.code += self.prescanner.scan()

def sntx_message(self, pos=None):
"""Send a message to the feeder."""
if pos is None:
pos = self.pos
pre, post = self.code[:pos], self.code[pos:].rstrip("\n")
@@ -356,9 +392,10 @@ def sntx_message(self, pos=None):
else:
self.feeder.message("Syntax", "sntxf", pre, post)

# TODO: Convert this to __next__ in the future?
def next(self):
"return next token"
self.skip_blank()
"Returns the next token"
self._skip_blank()
if self.pos >= len(self.code):
return Token("END", "", len(self.code))

@@ -390,8 +427,8 @@ def next(self):
self.pos = match.end(0)
return Token(tag, text, match.start(0))

- def skip_blank(self):
-     "skip whitespace and comments"
+ def _skip_blank(self):
+     "Skip whitespace and comments"
comment = [] # start positions of comments
while True:
if self.pos >= len(self.code):
@@ -417,6 +454,7 @@ def skip_blank(self):
break

def t_String(self, match):
"String rule"
start, end = self.pos, None
self.pos += 1 # skip opening '"'
newlines = []
@@ -444,6 +482,7 @@ def t_String(self, match):
return Token("String", result, start)

def t_Number(self, match):
"Number rule"
text = match.group(0)
pos = match.end(0)
if self.code[pos - 1 : pos + 1] == "..":
@@ -454,21 +493,27 @@ def t_Number(self, match):
self.pos = pos
return Token("Number", text, match.start(0))

- def token_mode(self, match, tag, mode):
+ # This isn't used outside of this class, so it's considered internal.
+ def _token_mode(self, match, tag, mode):
"consume a token and switch mode"
text = match.group(0)
self.pos = match.end(0)
- self.change_mode(mode)
+ self._change_mode(mode)
return Token(tag, text, match.start(0))

def t_Get(self, match):
return self.token_mode(match, "Get", "filename")
"Get rule"
return self._token_mode(match, "Get", "filename")

def t_Put(self, match):
return self.token_mode(match, "Put", "filename")
"Put rule"
return self._token_mode(match, "Put", "filename")

def t_PutAppend(self, match):
return self.token_mode(match, "PutAppend", "filename")
"PutAppend rule"
return self._token_mode(match, "PutAppend", "filename")

def t_Filename(self, match):
return self.token_mode(match, "Filename", "expr")
"Filename rule"
return self._token_mode(match, "Filename", "expr")
