From 5b9d000d1f1870e124ce797e113cf52ad32a3e99 Mon Sep 17 00:00:00 2001
From: loicalleyne
Date: Wed, 6 Nov 2024 15:20:21 -0500
Subject: [PATCH] change schema serialization

---
 .gitignore                       |   3 +-
 README.md                        |  14 ++--
 bodkin.go                        |  94 ++++++-------------------
 cmd/main.go                      |  16 ++++-
 json2parquet/.gitignore          |   5 +-
 json2parquet/cmd/.gitignore      |   2 +
 json2parquet/cmd/cleaner/main.go | 115 +++++++++++++++++++++++++++++++
 json2parquet/cmd/main.go         |  78 +++++++++++++++++++++
 json2parquet/json2parquet.go     |   2 +-
 9 files changed, 242 insertions(+), 87 deletions(-)
 create mode 100644 json2parquet/cmd/.gitignore
 create mode 100644 json2parquet/cmd/cleaner/main.go
 create mode 100644 json2parquet/cmd/main.go

diff --git a/.gitignore b/.gitignore
index b4a493f..01d0172 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,5 @@
 go.work.sum
 avro/
 map.go
-*.schema
\ No newline at end of file
+*.schema
+*.pgo
\ No newline at end of file
diff --git a/README.md b/README.md
index 9d191b1..da58363 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ Bodkin enables you to use your _data_ to define and evolve your Arrow Schema.
 - Automatically evolves the Arrow schema with new fields when providing new inputs
 - Converts schema field types when unifying schemas to accept evolving input data
 - Tracks changes to the schema
-- Export/import a schema to/from a stub parquet file to persist schema definition
+- Export/import a serialized Arrow schema to/from a file or []byte to transmit or persist the schema definition
 
 ## 🚀 Install
 
@@ -150,11 +150,15 @@ for rdr.Next() {
 // ]
 ```
 
-Export your schema to a file, then import the file to retrieve the schema
+Export your schema to a file, then import the file to retrieve the schema; or export/import to/from a []byte.
 ```go
-_ = u.ExportSchema("./test.schema")
-imp, _ := u.ImportSchema("./test.schema")
-fmt.Printf("imported %v\n", imp.String())
+_ = u.ExportSchemaFile("./test.schema")
+imp, _ := u.ImportSchemaFile("./test.schema")
+fmt.Printf("imported %v\n", imp.String())
+
+bs, _ := u.ExportSchemaBytes()
+sc, _ := u.ImportSchemaBytes(bs)
+fmt.Printf("imported %v\n", sc.String())
 ```
 
 ## 💫 Show your support
diff --git a/bodkin.go b/bodkin.go
index 45ca3bc..25296a3 100644
--- a/bodkin.go
+++ b/bodkin.go
@@ -13,15 +13,10 @@ import (
     "strings"
 
     "github.com/apache/arrow-go/v18/arrow"
-    "github.com/apache/arrow-go/v18/arrow/array"
+    "github.com/apache/arrow-go/v18/arrow/flight"
     "github.com/apache/arrow-go/v18/arrow/memory"
-    "github.com/apache/arrow-go/v18/parquet"
-    "github.com/apache/arrow-go/v18/parquet/compress"
-    "github.com/apache/arrow-go/v18/parquet/file"
-    "github.com/apache/arrow-go/v18/parquet/pqarrow"
     "github.com/go-viper/mapstructure/v2"
     json "github.com/goccy/go-json"
-    "github.com/loicalleyne/bodkin/pq"
     omap "github.com/wk8/go-ordered-map/v2"
 )
 
@@ -205,90 +200,41 @@ func (u *Bodkin) Paths() []Field {
     return paths
 }
 
-// ExportSchema exports an Arrow Schema by writing a record full of null values to an Arrow IPC file.
-func (u *Bodkin) ExportSchema(exportPath string) error {
-    f, err := os.Open(exportPath)
-    if err != nil {
-        return err
-    }
-    defer f.Close()
-
+// ExportSchemaFile exports a serialized Arrow Schema to a file.
+func (u *Bodkin) ExportSchemaFile(exportPath string) error {
     schema, err := u.Schema()
     if err != nil {
         return err
     }
-    var prp *parquet.WriterProperties = parquet.NewWriterProperties(
-        parquet.WithDictionaryDefault(true),
-        parquet.WithVersion(parquet.V2_LATEST),
-        parquet.WithCompression(compress.Codecs.Zstd),
-        parquet.WithStats(true),
-        parquet.WithRootName("bodkin"),
-    )
-
-    pw, _, err := pq.NewParquetWriter(schema, prp, exportPath)
+    bs := flight.SerializeSchema(schema, memory.DefaultAllocator)
+    err = os.WriteFile(exportPath, bs, 0644)
     if err != nil {
         return err
     }
-    defer pw.Close()
-
-    rb := array.NewRecordBuilder(memory.DefaultAllocator, schema)
-    m := make(map[string]any)
-    for _, c := range u.old.children {
-        switch c.arrowType {
-        case arrow.STRING:
-            if c.field.Type.ID() != arrow.LIST {
-                m[c.name] = "nevergonnagiveyouup"
-            }
-        case arrow.FLOAT64:
-            m[c.name] = 1.2345
-        case arrow.INT8, arrow.INT16, arrow.INT32, arrow.INT64, arrow.UINT8, arrow.UINT16, arrow.UINT32, arrow.UINT64:
-            m[c.name] = 1234
-        }
-    }
-    data, err := json.Marshal(m)
-    if err != nil {
-        return err
-    }
-    // array.UnmarshalJSON for record builder will read in a single object and add the values to each field in the recordbuilder,
-    // missing fields will get a null and unexpected keys will be ignored. If reading in an array of records as a single batch,
-    // then use a structbuilder and use RecordFromStruct. This is fine as we are only interested in the Arrow schema.
-    for i := 0; i < 10; i++ {
-        err = rb.UnmarshalJSON(data)
-        if err != nil {
-            return err
-        }
-    }
-
-    rec := rb.NewRecord()
-    err = pw.WriteRecord(rec)
-    if err != nil {
-        return err
-    }
-
-    return f.Sync()
+    return nil
 }
 
-// ImportSchema imports an Arrow Schema from an Arrow IPC file.
-func (u *Bodkin) ImportSchema(importPath string) (*arrow.Schema, error) {
-    f, err := os.Open(importPath)
-    if err != nil {
-        return nil, err
-    }
-    defer f.Close()
-    pqr, err := file.OpenParquetFile(importPath, true)
+// ImportSchemaFile imports a serialized Arrow Schema from a file.
+func (u *Bodkin) ImportSchemaFile(importPath string) (*arrow.Schema, error) {
+    dat, err := os.ReadFile(importPath)
     if err != nil {
         return nil, err
     }
+    return flight.DeserializeSchema(dat, memory.DefaultAllocator)
+}
 
-    fr, err := pqarrow.NewFileReader(pqr, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
+// ExportSchemaBytes exports a serialized Arrow Schema.
+func (u *Bodkin) ExportSchemaBytes() ([]byte, error) {
+    schema, err := u.Schema()
     if err != nil {
         return nil, err
     }
-    schema, err := fr.Schema()
-    if schema == nil {
-        return nil, fmt.Errorf("could not import schema from %s : %v", importPath, err)
-    }
-    return schema, nil
+    return flight.SerializeSchema(schema, memory.DefaultAllocator), nil
+}
+
+// ImportSchemaBytes imports a serialized Arrow Schema.
+func (u *Bodkin) ImportSchemaBytes(dat []byte) (*arrow.Schema, error) {
+    return flight.DeserializeSchema(dat, memory.DefaultAllocator)
+}
 
 // Unify merges structured input's column definition with the previously input's schema.
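
The new functions above are thin wrappers over Arrow Flight's schema (de)serialization helpers, so the round trip can be sketched independently of Bodkin. A minimal, self-contained example (not taken from the patch; the `demo.schema` file name is illustrative):

```go
package main

import (
	"fmt"
	"os"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/flight"
	"github.com/apache/arrow-go/v18/arrow/memory"
)

func main() {
	// Build a small schema, serialize it with the flight helpers the
	// patch relies on, write it to disk, then read it back.
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "id", Type: arrow.PrimitiveTypes.Int64},
		{Name: "name", Type: arrow.BinaryTypes.String, Nullable: true},
	}, nil)
	bs := flight.SerializeSchema(schema, memory.DefaultAllocator)
	if err := os.WriteFile("demo.schema", bs, 0644); err != nil {
		panic(err)
	}
	dat, err := os.ReadFile("demo.schema")
	if err != nil {
		panic(err)
	}
	out, err := flight.DeserializeSchema(dat, memory.DefaultAllocator)
	if err != nil {
		panic(err)
	}
	fmt.Println(out.String())
}
```

Serializing via the flight IPC schema payload avoids writing a stub parquet file full of dummy records just to persist a schema, which is what the removed code did.
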
diff --git a/cmd/main.go b/cmd/main.go
index 483882e..57464cb 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -2,6 +2,7 @@ package main
 
 import (
     "fmt"
+    "log"
     "strings"
 
     "github.com/apache/arrow-go/v18/arrow/array"
@@ -169,18 +170,27 @@ func main() {
         fmt.Printf("%v : [%s]\n", e.Issue, e.Dotpath)
     }
     fmt.Println(u.Changes())
-    err = u.ExportSchema("./test.schema")
+    bs, err := u.ExportSchemaBytes()
     if err != nil {
         fmt.Println(err)
     } else {
-        imp, err := u.ImportSchema("./test.schema")
+        imp, err := u.ImportSchemaBytes(bs)
         if err != nil {
             fmt.Println(err)
         } else {
             fmt.Printf("imported %v\n", imp.String())
         }
     }
-
+    err = u.ExportSchemaFile("./temp.schema")
+    if err != nil {
+        log.Fatal(err)
+    }
+    sb, err := u.ImportSchemaFile("./temp.schema")
+    if err != nil {
+        fmt.Println(err)
+    } else {
+        fmt.Println("deserialized:\n", sb.String())
+    }
 }
 
 var jsonS1 string = `{
diff --git a/json2parquet/.gitignore b/json2parquet/.gitignore
index 4721a81..81d7499 100644
--- a/json2parquet/.gitignore
+++ b/json2parquet/.gitignore
@@ -1,3 +1,2 @@
-cmd/*
-cmd/*.json
-cmd/*.parquet
\ No newline at end of file
+*.json
+*.parquet
\ No newline at end of file
diff --git a/json2parquet/cmd/.gitignore b/json2parquet/cmd/.gitignore
new file mode 100644
index 0000000..81d7499
--- /dev/null
+++ b/json2parquet/cmd/.gitignore
@@ -0,0 +1,2 @@
+*.json
+*.parquet
\ No newline at end of file
diff --git a/json2parquet/cmd/cleaner/main.go b/json2parquet/cmd/cleaner/main.go
new file mode 100644
index 0000000..c296c97
--- /dev/null
+++ b/json2parquet/cmd/cleaner/main.go
@@ -0,0 +1,115 @@
+package main
+
+import (
+    "bufio"
+    "flag"
+    "fmt"
+    "log"
+    "os"
+    "path/filepath"
+
+    "github.com/goccy/go-json"
+
+    "github.com/redpanda-data/benthos/v4/public/bloblang"
+)
+
+// jcleaner takes as input a JSONL file, and removes all null fields, empty arrays,
+// empty objects and empty strings.
+func main() {
+    inputFile := flag.String("in", "", "input file")
+    outputFile := flag.String("out", "", "output file")
+    flag.Parse()
+    if *inputFile == "" {
+        log.Fatal("no input file specified")
+    }
+    if *outputFile == "" {
+        log.Fatal("no output file specified")
+    }
+    problemLines := fileNameWithoutExt(*outputFile) + "_problem.json"
+    f, err := os.Open(*inputFile)
+    if err != nil {
+        panic(err)
+    }
+    defer func() {
+        if r := recover(); r != nil {
+            fmt.Println(r)
+        }
+    }()
+    defer f.Close()
+    bloblangMapping := `map remove_null_empty {
+  root = match {
+    (this.type() == "object" && this.length() == 0) => deleted()
+    this.type() == "object" => this.map_each(i -> i.value.apply("remove_null_empty"))
+    (this.type() == "array" && this.length() == 0) => deleted()
+    this.type() == "array" => this.map_each(v -> v.apply("remove_null_empty"))
+    this.type() == "null" => deleted()
+    this.type() == "string" && this.length() == 0 => deleted()
+  }
+}
+root = this.apply("remove_null_empty")`
+    exe, err := bloblang.Parse(bloblangMapping)
+    if err != nil {
+        // a nil executor would panic in ApplyBloblangMapping, so bail out early
+        log.Fatal(err)
+    }
+
+    nf, err := os.Create(*outputFile)
+    if err != nil {
+        panic(err)
+    }
+    defer nf.Close()
+    w := bufio.NewWriterSize(nf, 1024*4)
+
+    pf, err := os.Create(problemLines)
+    if err != nil {
+        panic(err)
+    }
+    defer pf.Close()
+    pw := bufio.NewWriterSize(pf, 1024*4)
+
+    r := bufio.NewReaderSize(f, 1024*4)
+    s := bufio.NewScanner(r)
+    newline := []byte("\n")
+    for s.Scan() {
+        y := s.Bytes()
+        b, err := ApplyBloblangMapping(y, exe)
+        if err != nil {
+            // lines the mapping can't process are preserved in the problem file
+            pw.Write(y)
+            pw.Write(newline)
+            continue
+        }
+        _, err = w.Write(b)
+        if err != nil {
+            pw.Write(y)
+            pw.Write(newline)
+            continue
+        }
+        w.Write(newline)
+    }
+    w.Flush()
+    pw.Flush()
+}
+
+func ApplyBloblangMapping(jsonInput []byte, exe *bloblang.Executor) ([]byte, error) {
+    // Parse the JSON input into a map[string]interface{}
+    var inputMap map[string]interface{}
+    if err := json.Unmarshal(jsonInput, &inputMap); err != nil {
+        return nil, err
+    }
+
+    // Execute the Bloblang mapping
+    res, err := exe.Query(inputMap)
+    if err != nil {
+        return nil, err
+    }
+
+    // Convert the result back into a JSON string
+    jsonResult, err := json.Marshal(res)
+    if err != nil {
+        return nil, err
+    }
+
+    return jsonResult, nil
+}
+
+func fileNameWithoutExt(fileName string) string {
+    return fileName[:len(fileName)-len(filepath.Ext(fileName))]
+}
diff --git a/json2parquet/cmd/main.go b/json2parquet/cmd/main.go
new file mode 100644
index 0000000..e047734
--- /dev/null
+++ b/json2parquet/cmd/main.go
@@ -0,0 +1,78 @@
+package main
+
+import (
+    "flag"
+    "fmt"
+    "log"
+    "os"
+    "runtime/pprof"
+
+    "github.com/redpanda-data/benthos/v4/public/bloblang"
+
+    "github.com/loicalleyne/bodkin"
+    j2p "github.com/loicalleyne/bodkin/json2parquet"
+)
+
+var exe *bloblang.Executor
+var cpuprofile = flag.String("cpuprofile", "default.pgo", "write cpu profile to `file`")
+
+func main() {
+    inferMode := flag.Bool("infer_timeunits", true, "Infer date, time and timestamps from strings")
+    quotedValuesAreStrings := flag.Bool("quoted_values_are_strings", false, "Treat quoted bool, float and integer values as strings")
+    withTypeConversion := flag.Bool("type_conversion", false, "upgrade field types if data changes")
+    inputFile := flag.String("in", "t.json", "input file")
+    outputFile := flag.String("out", "screens.parquet", "output file")
+    dryRun := flag.Bool("n", false, "only print the schema")
+    lines := flag.Int64("lines", 0, "number of lines from which to infer schema; 0 means whole file is scanned")
+    flag.Parse()
+    if *inputFile == "" {
== "" { + log.Fatal("no input file specified") + } + log.Println("detecting schema") + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) + if err != nil { + log.Fatal("could not create CPU profile: ", err) + } + defer f.Close() + if err := pprof.StartCPUProfile(f); err != nil { + log.Fatal("could not start CPU profile: ", err) + } + defer pprof.StopCPUProfile() + defer log.Printf("program ended\nto view profile run 'go tool pprof -http localhost:8080 %s\n", *cpuprofile) + } + var opts []bodkin.Option + if *inferMode { + opts = append(opts, bodkin.WithInferTimeUnits()) + } + if *withTypeConversion { + opts = append(opts, bodkin.WithTypeConversion()) + } + if *quotedValuesAreStrings { + opts = append(opts, bodkin.WithQuotedValuesAreStrings()) + } + if *lines != 0 { + opts = append(opts, bodkin.WithMaxCount(*lines)) + } + arrowSchema, n, err := j2p.SchemaFromFile(*inputFile, opts...) + if err == bodkin.ErrInvalidInput { + fmt.Printf("schema creation error %v\n", err) + } + if arrowSchema == nil { + log.Fatal("nil schema") + } + log.Printf("schema from %d records\n", n) + fmt.Println(arrowSchema.String()) + if !*dryRun { + if *outputFile == "" { + log.Fatal("no output file specified") + } + log.Println("starting conversion to parquet") + + n, err = j2p.RecordsFromFile(*inputFile, *outputFile, arrowSchema, nil) + log.Printf("%d records written", n) + if err != nil { + log.Printf("parquet error: %v", err) + } + } +} diff --git a/json2parquet/json2parquet.go b/json2parquet/json2parquet.go index 1c00c71..b94976b 100644 --- a/json2parquet/json2parquet.go +++ b/json2parquet/json2parquet.go @@ -39,7 +39,7 @@ func FromReader(r io.Reader, opts ...bodkin.Option) (*arrow.Schema, int64, error return schema, u.Count(), err } -func SchemaFromFile(inputFile string, opts ...bodkin.Option) (*arrow.Schema, int, error) { +func SchemaFromFile(inputFile string, opts ...bodkin.Option) (*arrow.Schema, int64, error) { f, err := os.Open(inputFile) if err != nil { return nil, 0, err