Skip to content

Commit

Permalink
go doc comments
Browse files Browse the repository at this point in the history
  • Loading branch information
loicalleyne committed Oct 30, 2024
1 parent 130ec5e commit 205e54d
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 35 deletions.
55 changes: 48 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
# Bodkin
Go library for decoding generic map values and native Go structures to Apache Arrow.
Go library for generating schemas and decoding generic map values and native Go structures to Apache Arrow.

The goal is to provide a useful toolkit to make it easier to use Arrow, and by extension Parquet.

## Features

- Convert a structured input (json string or []byte, Go struct or map[string]any) into an Apache Arrow schema
- Evolve the schema with new fields by providing new inputs
- Convert schema field types to accept evolving input schemas
- Track the changes to the schema
- Converts a structured input (json string or []byte, Go struct or map[string]any) into an Apache Arrow schema
- Supports nested types
- Automatically evolves the Arrow schema with new fields when providing new inputs
- Converts schema field types when unifying schemas to accept evolving input data
- Tracks changes to the schema

## 🚀 Install

Expand Down Expand Up @@ -49,8 +52,6 @@ fmt.Printf("original input %v\nerrors:\n%v\n", s.String(), err)
// errors:
// could not determine type of unpopulated field : [previous]
// could not determine element type of empty array : [arrayscalar]
// could not determine type of unpopulated field : [previous]
// could not determine element type of empty array : [arrayscalar]
```

Provide some more structured data and print out the new merged schema and the list of changes
Expand Down Expand Up @@ -92,6 +93,46 @@ fmt.Println(u.Changes())
// changed $timefield : from time64[ns] to utf8
```

Also works with Go structs
```go
stu := Student{
Name: "StudentName",
Age: 25,
ID: 123456,
Day: 123,
}
sch := School{
Name: "SchoolName",
Address: AddressType{
Country: "CountryName",
},
}
e, _ := bodkin.NewBodkin(stu, bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion())
sc, err := e.OriginSchema()
fmt.Printf("original input %v\nerrors:\n%v\n", sc.String(), err)
// original input schema:
// fields: 5
// - ID: type=int64, nullable
// - Day: type=int32, nullable
// - School: type=struct<Name: utf8, Address: struct<Street: utf8, City: utf8, Region: utf8, Country: utf8>>, nullable
// - Name: type=utf8, nullable
// - Age: type=int32, nullable
// errors:
// <nil>
e.Unify(sch)
sc, err = e.OriginSchema()
fmt.Printf("unified %v\nerrors:\n%v\n", sc.String(), err)
// unified schema:
// fields: 5
// - ID: type=int64, nullable
// - Day: type=int32, nullable
// - School: type=struct<Name: utf8, Address: struct<Street: utf8, City: utf8, Region: utf8, Country: utf8>>, nullable
// - Name: type=utf8, nullable
// - Age: type=int32, nullable
// errors:
// <nil>
```

Use the generated Arrow schema with Arrow's built-in JSON reader to decode JSON data into Arrow records
```go
rdr = array.NewJSONReader(strings.NewReader(jsonS2), schema)
Expand Down
11 changes: 7 additions & 4 deletions bodkin.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// Package bodkin is a Go library for generating schemas and decoding generic map values and native Go structures to Apache Arrow.
// The goal is to provide a useful toolkit to make it easier to use Arrow, and by extension Parquet.
package bodkin

import (
Expand All @@ -9,6 +11,7 @@ import (
"github.com/goccy/go-json"
)

// Option configures a Bodkin
type (
Option func(config)
config *Bodkin
Expand All @@ -26,8 +29,8 @@ type Bodkin struct {
}

// NewBodkin returns a new Bodkin value from a structured input.
// Input must be a json byte slice or string, a struct with exported fields or map[string]any.
// Any unpopulated fields, empty objects or empty slices in the input are skipped as their
// Input must be a json byte slice or string, a Go struct with exported fields or map[string]any.
// Any unpopulated fields, empty objects or empty slices in JSON or map[string]any inputs are skipped as their
// types cannot be evaluated and converted.
func NewBodkin(a any, opts ...Option) (*Bodkin, error) {
m := map[string]interface{}{}
Expand Down Expand Up @@ -82,7 +85,7 @@ func (u *Bodkin) Changes() error { return u.changes }
// Times use a format of HH:MM or HH:MM:SS[.zzz] where the fractions of a second cannot
// exceed the precision allowed by the time unit, otherwise unmarshalling will error.
//
// # Dates use YYYY-MM-DD format
// Dates use YYYY-MM-DD format.
//
// Timestamps use RFC3339Nano format except without a timezone, all of the following are valid:
//
Expand All @@ -104,7 +107,7 @@ func WithTypeConversion() Option {
}

// Unify merges the structured input's column definitions with the previous input's schema.
// Any unpopulated fields, empty objects or empty slices in the input are skipped.
// Any unpopulated fields, empty objects or empty slices in JSON input are skipped.
func (u *Bodkin) Unify(a any) {
m := map[string]interface{}{}
switch input := a.(type) {
Expand Down
38 changes: 38 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,45 @@ import (
"github.com/loicalleyne/bodkin"
)

// AddressType is a sample postal address used as the nested Address field
// of School. All four components are plain strings, declared in the order
// Street, City, Region, Country.
type AddressType struct {
	Street, City, Region, Country string
}
// School is a sample struct with a nested AddressType field. In main, a
// School value is passed to Unify on a Bodkin that was created from a Student.
type School struct {
Name string
// Address is a nested struct value rather than a scalar.
Address AddressType
}

// Student is the sample struct that main passes to bodkin.NewBodkin to
// build the initial schema.
type Student struct {
Name string
Age int32
ID int64
Day int32
// School is embedded (no explicit field name) rather than declared as a named field.
School
}

func main() {
stu := Student{
Name: "StudentName",
Age: 25,
ID: 123456,
Day: 123,
}
sch := School{
Name: "SchoolName",
Address: AddressType{
Country: "CountryName",
},
}
e, _ := bodkin.NewBodkin(stu, bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion())
sc, err := e.OriginSchema()
fmt.Printf("original input %v\nerrors:\n%v\n", sc.String(), err)
e.Unify(sch)
sc, err = e.OriginSchema()
fmt.Printf("unified %v\nerrors:\n%v\n\n", sc.String(), err)

u, _ := bodkin.NewBodkin(jsonS1, bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion())
s, err := u.OriginSchema()
fmt.Printf("original input %v\nerrors:\n%v\n", s.String(), err)
Expand Down
59 changes: 35 additions & 24 deletions schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ type fieldPos struct {
err error
}

// Schema evaluation/evolution errors.
var (
ErrUndefinedInput = errors.New("nil input")
ErrInvalidInput = errors.New("invalid input")
Expand All @@ -30,19 +31,23 @@ var (
ErrUndefinedArrayElementType = errors.New("could not determine element type of empty array")
ErrNotAnUpgradableType = errors.New("is not an upgradable type")
ErrPathNotFound = errors.New("path not found")
timestampMatchers []*regexp.Regexp
dateMatcher *regexp.Regexp
timeMatcher *regexp.Regexp
// UpgradableTypes are scalar types that can be upgraded to a more flexible type.
UpgradableTypes []arrow.Type = []arrow.Type{arrow.INT8,
arrow.INT16,
arrow.INT32,
arrow.INT64,
arrow.DATE32,
arrow.TIME64,
arrow.TIMESTAMP,
arrow.STRING,
}
)

// UpgradableTypes are scalar types that can be upgraded to a more flexible type.
// The variable's type is inferred from the composite literal, so it is not
// repeated on the left-hand side of the declaration.
var UpgradableTypes = []arrow.Type{
	arrow.INT8,
	arrow.INT16,
	arrow.INT32,
	arrow.INT64,
	arrow.DATE32,
	arrow.TIME64,
	arrow.TIMESTAMP,
	arrow.STRING,
}

// Package-level regular expressions. Presumably these are compiled in init
// and used to recognize timestamp, date and time strings; confirm against
// init, whose body is not visible here.
var (
	timestampMatchers        []*regexp.Regexp
	dateMatcher, timeMatcher *regexp.Regexp
)

func init() {
Expand Down Expand Up @@ -104,6 +109,7 @@ func (f *fieldPos) mapChildren() {
}
}

// getPath returns a field found at a defined path, otherwise returns ErrPathNotFound.
func (f *fieldPos) getPath(path []string) (*fieldPos, error) {
if len(path) == 0 { // degenerate input
return nil, fmt.Errorf("getPath needs at least one key")
Expand Down Expand Up @@ -143,12 +149,12 @@ func (f *fieldPos) dotPath() string {
return path
}

// getValue retrieves the value from the map[string]interface{}
// getValue retrieves the value from the map[string]any
// by following the field's key path
func (f *fieldPos) getValue(m map[string]interface{}) interface{} {
var value interface{} = m
func (f *fieldPos) getValue(m map[string]any) any {
var value any = m
for _, key := range f.namePath() {
valueMap, ok := value.(map[string]interface{})
valueMap, ok := value.(map[string]any)
if !ok {
return nil
}
Expand Down Expand Up @@ -221,11 +227,13 @@ func errWrap(f *fieldPos) error {
return err
}

func mapToArrow(f *fieldPos, m map[string]interface{}) {
// mapToArrow traverses a map[string]any and creates a fieldPos tree from
// which an Arrow schema can be generated.
func mapToArrow(f *fieldPos, m map[string]any) {
for k, v := range m {
child := f.newChild(k)
switch t := v.(type) {
case map[string]interface{}:
case map[string]any:
mapToArrow(child, t)
var fields []arrow.Field
for _, c := range child.children {
Expand All @@ -236,7 +244,7 @@ func mapToArrow(f *fieldPos, m map[string]interface{}) {
f.assignChild(child)
}

case []interface{}:
case []any:
if len(t) <= 0 {
f.err = errors.Join(f.err, fmt.Errorf("%v : %v", ErrUndefinedArrayElementType, child.namePath()))
} else {
Expand All @@ -258,9 +266,11 @@ func mapToArrow(f *fieldPos, m map[string]interface{}) {
f.field = arrow.Field{Name: f.name, Type: arrow.StructOf(fields...), Nullable: true}
}

func sliceElemType(f *fieldPos, v []interface{}) arrow.DataType {
// sliceElemType evaluates the slice type and returns an Arrow DataType
// to be used in building an Arrow Field.
func sliceElemType(f *fieldPos, v []any) arrow.DataType {
switch ft := v[0].(type) {
case map[string]interface{}:
case map[string]any:
child := f.newChild(f.name + ".elem")
mapToArrow(child, ft)
var fields []arrow.Field
Expand All @@ -269,13 +279,13 @@ func sliceElemType(f *fieldPos, v []interface{}) arrow.DataType {
}
f.assignChild(child)
return arrow.StructOf(fields...)
case []interface{}:
case []any:
if len(ft) < 1 {
f.err = errors.Join(f.err, fmt.Errorf("%v : %v", ErrUndefinedArrayElementType, f.namePath()))
return arrow.GetExtensionType("skip")
}
child := f.newChild(f.name + ".elem")
et := sliceElemType(child, v[0].([]interface{}))
et := sliceElemType(child, v[0].([]any))
f.assignChild(child)
return arrow.ListOf(et)
default:
Expand All @@ -284,6 +294,7 @@ func sliceElemType(f *fieldPos, v []interface{}) arrow.DataType {
return nil
}

// goType2Arrow maps a Go type to an Arrow DataType.
func goType2Arrow(f *fieldPos, gt any) arrow.DataType {
var dt arrow.DataType
switch t := gt.(type) {
Expand Down

0 comments on commit 205e54d

Please sign in to comment.