From bb706317065e46ec53ce808347b500564dc2853d Mon Sep 17 00:00:00 2001 From: loicalleyne Date: Wed, 30 Oct 2024 11:45:00 -0400 Subject: [PATCH] add quotedValuesAreStrings option --- bodkin.go | 22 +++++++++++++++------- cmd/main.go | 3 ++- schema.go | 22 ++++++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/bodkin.go b/bodkin.go index b4287ad..c37a2e5 100644 --- a/bodkin.go +++ b/bodkin.go @@ -19,13 +19,14 @@ type ( // Bodkin is a collection of field paths, describing the columns of a structured input(s). type Bodkin struct { - original *fieldPos - old *fieldPos - new *fieldPos - inferTimeUnits bool - typeConversion bool - err error - changes error + original *fieldPos + old *fieldPos + new *fieldPos + inferTimeUnits bool + quotedValuesAreStrings bool + typeConversion bool + err error + changes error } // NewBodkin returns a new Bodkin value from a structured input. @@ -106,6 +107,13 @@ func WithTypeConversion() Option { } } +// WithQuotedValuesAreStrings keeps quoted values as strings instead of inferring boolean, integer or float types from their contents. +func WithQuotedValuesAreStrings() Option { + return func(cfg config) { + cfg.quotedValuesAreStrings = true + } +} + // Unify merges structured input's column definition with the previously input's schema. // Any uppopulated fields, empty objects or empty slices in JSON input are skipped. func (u *Bodkin) Unify(a any) { diff --git a/cmd/main.go b/cmd/main.go index 1cb27c4..a0d9f51 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -104,7 +104,8 @@ var jsonS1 string = `{ "results": [{"id":7594}], "arrayscalar":[], "datefield":"1979-01-01", - "timefield":"01:02:03" + "timefield":"01:02:03", + "boolquotedfield":"true" }` var jsonS2 string = `{ diff --git a/schema.go b/schema.go index 02d9e55..109a29c 100644 --- a/schema.go +++ b/schema.go @@ -44,14 +44,19 @@ var UpgradableTypes []arrow.Type = []arrow.Type{arrow.INT8, arrow.STRING, } +// Regular expressions and variables for type inference. 
var ( timestampMatchers []*regexp.Regexp dateMatcher *regexp.Regexp timeMatcher *regexp.Regexp + integerMatcher *regexp.Regexp + floatMatcher *regexp.Regexp + boolMatcher []string ) func init() { registerTsMatchers() + registerQuotedStringValueMatchers() } func registerTsMatchers() { @@ -64,6 +69,12 @@ func registerTsMatchers() { regexp.MustCompile(`^\d{4}-\d{1,2}-\d{1,2}[T ]\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})? *(([+-]\d{1,2}(:\d{1,2})?)|Z|UTC)?$`)) } +func registerQuotedStringValueMatchers() { + integerMatcher = regexp.MustCompile(`^[-+]?\d+$`) + floatMatcher = regexp.MustCompile(`^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$`) + boolMatcher = append(boolMatcher, "true", "false") +} + func newFieldPos(b *Bodkin) *fieldPos { f := new(fieldPos) f.owner = b @@ -352,6 +363,17 @@ func goType2Arrow(f *fieldPos, gt any) arrow.DataType { return arrow.FixedWidthTypes.Time64ns } } + if !f.owner.quotedValuesAreStrings { + if slices.Contains(boolMatcher, t) { + return arrow.FixedWidthTypes.Boolean + } + if integerMatcher.MatchString(t) { + return arrow.PrimitiveTypes.Int64 + } + if floatMatcher.MatchString(t) { + return arrow.PrimitiveTypes.Float64 + } + } dt = arrow.BinaryTypes.String case []byte: dt = arrow.BinaryTypes.Binary