-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathrdf2rdf.go
193 lines (168 loc) · 4.28 KB
/
rdf2rdf.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
package main
import (
"flag"
"fmt"
"io"
"log"
"os"
"time"
"unicode/utf8"
"github.com/knakk/rdf"
"github.com/mitchellh/ioprogress"
)
// usage is the complete help text for the command. main installs it as the
// body of flag.Usage, which writes it verbatim to standard error. It describes
// the available flags, the streaming versus in-memory modes, and the mapping
// from file extensions to RDF serialization formats.
var usage = `rdf2rdf
-------
Convert between different RDF serialization formats.
Usage:
rdf2rdf -in=input.xml -out=output.ttl
Options:
-h --help Show this message.
-in Input file.
-out Output file.
-stream=true Streaming mode.
-v=false Verbose mode (shows progress indicator)
By default the converter is streaming both input and output, emitting
converted triples/quads as soon as they are available. This ensures you can
convert huge files with minimum memory footprint. However, if you have
small datasets you can choose to load all data into memory before conversion.
This makes it possible to sort the data, remove duplicate triples, and
potentially generate more compact Turtle serializations, maximizing predicate
and object lists. Do this by setting the flag stream=false.
Conversion from a quad-format to a triple-format will disregard the triple's
context (graph). Conversion from a triple-format to a quad-format is not
supported.
Input and ouput formats are determined by file extensions, according to
the following table:
Format | File extension
----------|-------------------
N-Triples | .nt
N-Quads | .nq
RDF/XML | .rdf .rdfxml .xml
Turtle | .ttl
`
// main is the command entry point. It parses the -in, -out, -v and -stream
// flags, derives the input and output RDF formats from the file extensions,
// and converts the input file into the output file via tripleToTriple.
//
// Every failure is reported through log.Fatal with an "ERROR: " prefix, which
// terminates the process with a non-zero exit status. With -v a progress
// indicator is drawn on standard output while the input is read, followed by
// a summary line with the number of converted triples and the elapsed time.
func main() {
	log.SetFlags(0)
	log.SetPrefix("ERROR: ")
	flag.Usage = func() {
		// Fprint rather than Fprintf: usage is plain text, not a format
		// string, so a literal '%' in it must not be interpreted.
		fmt.Fprint(os.Stderr, usage)
	}
	input := flag.String("in", "", "Input file")
	output := flag.String("out", "", "Output file")
	verbose := flag.Bool("v", false, "Verbose mode")
	stream := flag.Bool("stream", true, "Streaming mode")
	flag.Parse()

	if *input == "" || *output == "" {
		fmt.Println("Usage:")
		flag.PrintDefaults()
		os.Exit(1)
	}

	// Resolve both formats before touching the file system. os.Create
	// truncates an existing file, so creating the output first would destroy
	// data (possibly the input itself) on an invocation that is then rejected.
	inExt := fileExtension(*input)
	outExt := fileExtension(*output)

	var inFormat, outFormat rdf.Format
	switch inExt {
	case "nt":
		inFormat = rdf.NTriples
	case "nq":
		inFormat = rdf.NQuads
	case "ttl":
		inFormat = rdf.Turtle
	case "xml", "rdf", "rdfxml":
		inFormat = rdf.RDFXML
	case "":
		log.Fatal("Unknown file format. No file extension on input file.")
	default:
		log.Fatalf("Unsupported file exension on input file: %s", *input)
	}

	// Checked after the input extension is known to be valid, so that a
	// missing or unsupported extension is reported as such rather than as
	// "identical formats".
	if inExt == outExt {
		log.Fatal("No conversion necessary. Input and output formats are identical.")
	}

	switch outExt {
	case "nt":
		outFormat = rdf.NTriples
	case "nq":
		// No other quad-formats supported ATM
		log.Fatal("Serializing to N-Quads currently not supported.")
	case "ttl":
		outFormat = rdf.Turtle
	case "":
		log.Fatal("Unknown file format. No file extension on output file.")
	default:
		log.Fatalf("Unsupported file exension on output file: %s", *output)
	}

	inFile, err := os.Open(*input)
	if err != nil {
		log.Fatal(err)
	}
	defer inFile.Close()

	var inFileRdr io.Reader = inFile
	if *verbose {
		// The progress reader needs the total size to render a percentage.
		stat, err := inFile.Stat()
		if err != nil {
			log.Fatal(err)
		}
		inFileRdr = &ioprogress.Reader{
			Reader:       inFile,
			Size:         stat.Size(),
			DrawInterval: time.Microsecond,
			DrawFunc: ioprogress.DrawTerminalf(os.Stdout, func(p, t int64) string {
				return ioprogress.DrawTextFormatBytes(p, t)
			}),
		}
	}

	outFile, err := os.Create(*output)
	if err != nil {
		log.Fatal(err)
	}

	t0 := time.Now()
	n := tripleToTriple(inFileRdr, outFile, inFormat, outFormat, *stream)
	elapsed := time.Since(t0)

	// Close explicitly instead of deferring so that a failure to finish
	// writing the output is reported instead of silently dropped.
	if err := outFile.Close(); err != nil {
		log.Fatal(err)
	}

	if *verbose {
		fmt.Printf("Done. Converted %d triples in %v.\n", n, elapsed)
	}
}
// tripleToTriple reads triples from inFile, decoded as inFormat, and writes
// them to outFile, encoded as outFormat. It returns the number of triples
// converted.
//
// When stream is true, triples are decoded and encoded one at a time until
// the decoder reports io.EOF. Otherwise all triples are decoded into memory
// first and then encoded in a single call. Any decoding, encoding or
// encoder-close error terminates the program through log.Fatal.
func tripleToTriple(inFile io.Reader, outFile io.Writer, inFormat, outFormat rdf.Format, stream bool) int {
	decoder := rdf.NewTripleDecoder(inFile, inFormat)
	// TODO set base to file name?
	encoder := rdf.NewTripleEncoder(outFile, outFormat)

	var count int
	if !stream {
		// In-memory mode: load everything, then serialize in one go.
		triples, err := decoder.DecodeAll()
		if err != nil {
			log.Fatal(err)
		}
		if err := encoder.EncodeAll(triples); err != nil {
			log.Fatal(err)
		}
		count = len(triples)
	} else {
		// Streaming mode: emit each triple as soon as it has been decoded.
		for {
			triple, err := decoder.Decode()
			if err == io.EOF {
				break
			}
			if err != nil {
				log.Fatal(err)
			}
			if err := encoder.Encode(triple); err != nil {
				log.Fatal(err)
			}
			count++
		}
	}

	if err := encoder.Close(); err != nil {
		log.Fatal(err)
	}
	return count
}
// fileExtension returns the extension of the file named by s: the text after
// the last '.' in the final path element, without the dot itself.
//
// It returns the empty string when the final path element contains no dot,
// which is the value callers compare against to detect a missing extension.
// A name that ends in a dot also yields the empty string.
func fileExtension(s string) string {
	i := len(s)
	for i > 0 {
		r, w := utf8.DecodeLastRuneInString(s[:i])
		switch {
		case r == '.':
			return s[i:]
		case w == 1 && os.IsPathSeparator(s[i-1]):
			// Reached the directory part; a dot further left belongs to a
			// directory name, not to the file.
			return ""
		}
		i -= w
	}
	return ""
}