Update README.md

BioJulia · Mar 9, 2023 · 395108b · 395108b
1 parent 079e24b
commit 395108b
Showing 1 changed file with 67 additions and 75 deletions.
diff --git a/README.md b/README.md
@@ -1,91 +1,83 @@
-Automa.jl
-=========
+# Automa.jl
 
 [![Docs Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://biojulia.github.io/Automa.jl/latest/)
 [![codecov.io](http://codecov.io/github/BioJulia/Automa.jl/coverage.svg?branch=master)](http://codecov.io/github/BioJulia/Automa.jl?branch=master)
 
-A Julia package for text validation, parsing, and tokenizing based on state machine compiler.
+Automa is a regex-to-Julia compiler.
+By compiling regex to Julia code in the form of `Expr` objects,
+Automa provides facilities to create efficient and robust regex-based lexers, tokenizers and parsers using Julia's metaprogramming capabilities. 
+You can view Automa as a regex engine that can insert arbitrary Julia code into its input matching process, which will be executed when certain parts of the regex match an input.
 
-![Schema of Automa.jl](/docs/src/figure/Automa.png)
+![Schema of Automa.jl](figure/Automa.png)
 
-Automa.jl compiles regular expressions into Julia code, which is then compiled
-into low-level machine code by the Julia compiler. Automa.jl is designed to
-generate very efficient code to scan large text data, which is often much faster
-than handcrafted code. Automa.jl can insert arbitrary Julia code that will be
-executed in state transitions. This makes it possible, for example, to extract
-substrings that match a part of a regular expression.
+Automa is designed to generate very efficient code to scan large text data, often much faster than handcrafted code.
 
-This is a number literal tokenizer using Automa.jl ([numbers.jl](example/numbers.jl)):
+For more information, [read the documentation](https://biojulia.github.io/Automa.jl/latest/), or read the examples below and in the `example/` directory of this repository.
+
+## Examples
+### Validate that some text is composed only of ASCII alphanumeric characters
 ```julia
-# A tokenizer of octal, decimal, hexadecimal and floating point numbers
-# =====================================================================
-
-import Automa
-import Automa.RegExp: @re_str
-const re = Automa.RegExp
-
-# Describe patterns in regular expression.
-oct      = re"0o[0-7]+"
-dec      = re"[-+]?[0-9]+"
-hex      = re"0x[0-9A-Fa-f]+"
-prefloat = re"[-+]?([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)"
-float    = prefloat | re.cat(prefloat | re"[-+]?[0-9]+", re"[eE][-+]?[0-9]+")
-number   = oct | dec | hex | float
-numbers  = re.cat(re.opt(number), re.rep(re" +" * number), re" *")
-
-# Register action names to regular expressions.
-number.actions[:enter] = [:mark]
-oct.actions[:exit]     = [:oct]
-dec.actions[:exit]     = [:dec]
-hex.actions[:exit]     = [:hex]
-float.actions[:exit]   = [:float]
-
-# Compile a finite-state machine.
-machine = Automa.compile(numbers)
-
-# This generates a SVG file to visualize the state machine.
-# write("numbers.dot", Automa.machine2dot(machine))
-# run(`dot -Tpng -o numbers.png numbers.dot`)
-
-# Bind an action code for each action name.
-actions = Dict(
-    :mark  => :(mark = p),
-    :oct   => :(emit(:oct)),
-    :dec   => :(emit(:dec)),
-    :hex   => :(emit(:hex)),
-    :float => :(emit(:float)),
-)
+using Automa
 
-# Generate a tokenizing function from the machine.
-context = Automa.CodeGenContext()
-@eval function tokenize(data::String)
-    tokens = Tuple{Symbol,String}[]
-    mark = 0
-    $(Automa.generate_init_code(context, machine))
-    emit(kind) = push!(tokens, (kind, data[mark:p-1]))
-    $(Automa.generate_exec_code(context, machine, actions))
-    return tokens, cs == 0 ? :ok : cs < 0 ? :error : :incomplete
-end
+generate_buffer_validator(:validate_alphanumeric, re"[a-zA-Z0-9]*") |> eval
 
-tokens, status = tokenize("1 0x0123BEEF 0o754 3.14 -1e4 +6.022045e23")
+for s in ["abc", "aU81m", "!,>"]
+    println("$s is alphanumeric? $(isnothing(validate_alphanumeric(s)))")
+end
 ```
 
-This emits tokens and the final status:
+### Make a lexer
+```julia
+using Automa
+
+tokens = [
+    :identifier => re"[A-Za-z_][0-9A-Za-z_!]*",
+    :lparens => re"\(",
+    :rparens => re"\)",
+    :comma => re",",
+    :quot => re"\"",
+    :space => re"[\t\f ]+",
+];
+@eval @enum Token errortoken $(first.(tokens)...)
+make_tokenizer((errortoken, 
+    [Token(i) => j for (i,j) in enumerate(last.(tokens))]
+)) |> eval
+
+collect(tokenize(Token, """(alpha, "beta15")"""))
+```
 
-    ~/.j/v/Automa (master) $ julia -qL example/numbers.jl
-    julia> tokens
-    6-element Array{Tuple{Symbol,String},1}:
-     (:dec,"1")
-     (:hex,"0x0123BEEF")
-     (:oct,"0o754")
-     (:float,"3.14")
-     (:float,"-1e4")
-     (:float,"+6.022045e23")
+### Make a simple TSV file parser
+```julia
+using Automa
+
+machine = let
+    name = onexit!(onenter!(re"[^\t\r\n]+", :mark), :name)
+    field = onexit!(onenter!(re"[^\t\r\n]+", :mark), :field)
+    nameline = name * rep('\t' * name)
+    record = onexit!(field * rep('\t' * field), :record)
+    compile(nameline * re"\r?\n" * record * rep(re"\r?\n" * record) * rep(re"\r?\n"))
+end
 
-    julia> status
-    :ok
+actions = Dict(
+    :mark => :(pos = p),
+    :name => :(push!(headers, String(data[pos:p-1]))),
+    :field => quote
+        n_fields += 1
+        push!(fields, String(data[pos:p-1]))
+    end,
+    :record => quote
+        n_fields == length(headers) || error("Malformed TSV")
+        n_fields = 0
+    end
+)
 
-The compiled deterministic finite automaton (DFA) looks like this:
-![DFA](/docs/src/figure/numbers.png)
+@eval function parse_tsv(data)
+    headers = String[]
+    fields = String[]
+    pos = n_fields = 0
+    $(generate_code(machine, actions))
+    (headers, reshape(fields, length(headers), :))
+end
 
-For more details, see [fasta.jl](/example/fasta.jl) and read the docs page.
+header, data = parse_tsv("a\tabc\n12\t13\r\nxyc\tz\n\n")
+```