Skip to content

Commit

Permalink
add isbn filter
Browse files Browse the repository at this point in the history
  • Loading branch information
miku committed May 27, 2024
1 parent 2ae97a2 commit 0ae26e8
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 12 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ MAKEFLAGS := --jobs=$(shell nproc)

.PHONY: all assets bench clean clean-docs cloc deb imports lint members names rpm test vet

all: $(TARGETS)

# http://docs.travis-ci.com/user/languages/go/#Default-Test-Script
test:
# go get github.com/kylelemons/godebug/pretty
# go get github.com/kr/pretty
go test -v -cover ./...
# go mod tidy

all: $(TARGETS)

$(TARGETS): %: cmd/%/main.go
go build -ldflags="-w -s -linkmode=external" -o $@ $<

Expand Down
23 changes: 14 additions & 9 deletions filter/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@
//
// Taggable document should expose (maybe via interfaces):
//
// SerialNumbers() []string
// PublicationTitle() string
// Date() string
// Volume() string
// Issue() string
// DatabaseName() string
// SerialNumbers() []string
// PublicationTitle() string
// Date() string
// Volume() string
// Issue() string
// DatabaseName() string
//
// Tagger configuration, e.g. preferred method, failure tolerance.
//
// tagger.Tag(v interface{}) []string { ... }
// Tagger configuration, e.g. preferred method, failure tolerance.
//
// tagger.Tag(v interface{}) []string { ... }
package filter

import (
Expand Down Expand Up @@ -96,6 +95,12 @@ func unmarshalFilter(name string, raw json.RawMessage) (Filter, error) {
return nil, err
}
return &filter, nil
case "isbn":
var filter ISBNFilter
if err := json.Unmarshal(raw, &filter); err != nil {
return nil, err
}
return &filter, nil
case "package":
var filter PackageFilter
if err := json.Unmarshal(raw, &filter); err != nil {
Expand Down
83 changes: 83 additions & 0 deletions filter/isbn.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package filter

import (
"strings"

"github.com/segmentio/encoding/json"

log "github.com/sirupsen/logrus"

"github.com/miku/span/container"
"github.com/miku/span/formats/finc"
"github.com/miku/span/strutil"
"github.com/miku/span/xio"
)

// ISBNFilter allows records with a certain ISBN. A record is accepted by
// Apply as soon as one of its ISBN values is contained in Values.
type ISBNFilter struct {
// Values is the set of accepted ISBNs; it is (re)initialized and filled by
// UnmarshalJSON from the filter configuration.
Values *container.StringSet
}

// Apply reports whether the intermediate schema record carries at least one
// ISBN that is part of the configured set. Print and electronic ISBN are
// treated alike; the first hit decides.
func (f *ISBNFilter) Apply(is finc.IntermediateSchema) bool {
	for i := range is.ISBN {
		if !f.Values.Contains(is.ISBN[i]) {
			continue
		}
		return true
	}
	// No ISBN on the record (or none of them listed).
	return false
}

// UnmarshalJSON turns a config fragment into a filter. The fragment is an
// object with an "isbn" key, which may carry any combination of:
//
//	"list"  ISBNs given literally in the configuration
//	"file"  path to a file that is scanned line by line for ISBNs
//	"url"   link to such a file; it is saved locally first and then scanned
//
// Any previously held values are discarded. An error is returned if the JSON
// is invalid, the link cannot be saved or the file cannot be read.
func (f *ISBNFilter) UnmarshalJSON(p []byte) error {
	var conf struct {
		ISBN struct {
			Values []string `json:"list"`
			File   string   `json:"file"`
			Link   string   `json:"url"`
		} `json:"isbn"`
	}
	if err := json.Unmarshal(p, &conf); err != nil {
		return err
	}
	f.Values = container.NewStringSet()
	// Workaround: span-freeze replaces URLs with the "file://" protocol, which
	// http.Get does not recognize, so treat such links as plain local files.
	if strings.HasPrefix(conf.ISBN.Link, "file://") {
		conf.ISBN.File = strings.TrimPrefix(conf.ISBN.Link, "file://")
		conf.ISBN.Link = ""
	}
	if conf.ISBN.Link != "" {
		slink := xio.SavedLink{Link: conf.ISBN.Link}
		filename, err := slink.Save()
		if err != nil {
			return err
		}
		// The saved copy is only needed while this function reads it.
		defer slink.Remove()
		conf.ISBN.File = filename
	}
	if conf.ISBN.File != "" {
		lines, err := xio.ReadLines(conf.ISBN.File)
		if err != nil {
			return err
		}
		for _, line := range lines {
			// Valid ISBN can contain x, normalize to uppercase.
			line = strings.ToUpper(line)
			// Sniff ISBNs.
			found := 0
			for _, match := range strutil.ISBNPattern.FindAllString(line, -1) {
				// The pattern may include an "ISBN" prefix and surrounding
				// blanks in the match, and it also matches bare runs of
				// whitespace or hyphens. Strip the decoration and skip
				// anything without a digit, so junk neither pollutes the set
				// nor suppresses the warning below.
				candidate := strings.TrimSpace(strings.TrimPrefix(match, "ISBN"))
				if !strings.ContainsAny(candidate, "0123456789") {
					continue
				}
				f.Values.Add(candidate)
				found++
			}
			if found == 0 {
				log.Printf("isbn: warning: no ISBNs found on line: %s", line)
			}
		}
	}
	// Add any ISBN given as string in configuration.
	f.Values.Add(conf.ISBN.Values...)
	log.Printf("isbn: collected %d ISBN", f.Values.Size())
	return nil
}
8 changes: 7 additions & 1 deletion formats/crossref/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,15 @@ type Document struct {
Created DateField `json:"created"`
DOI string
Deposited DateField `json:"deposited"`
ISBN []string
ISSN []string
Indexed DateField `json:"indexed"`
IsReferencedByCount int64 `json:"is-referenced-by-count"`
IssnType []struct {
IsbnType []struct {
Type string `json:"type"`
Value string `json:"value"`
} `json:"isbn-type"`
IssnType []struct {
Type string `json:"type"`
Value string `json:"value"`
} `json:"issn-type"`
Expand Down Expand Up @@ -328,6 +333,7 @@ func (doc *Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
output.DOI = doc.DOI // refs #6312 and #10923, most // URL seem valid
output.Format = Formats.Lookup(doc.Type, DefaultFormat)
output.Genre = Genres.Lookup(doc.Type, "unknown")
output.ISBN = doc.ISBN
output.ISSN = doc.ISSN
output.Issue = strings.TrimLeft(doc.Issue, "0")
output.Languages = doc.FindLanguages()
Expand Down
3 changes: 3 additions & 0 deletions strutil/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ import (
// Patterns for sniffing standard numbers out of free text.
var (
	// ISSNPattern is a regular expression matching standard ISSN.
	ISSNPattern = regexp.MustCompile(`[0-9]{4,4}-[0-9]{3,3}[0-9X]`)

	// ISBNPattern matches ISBN-like runs in free text: a digit, followed by
	// any number of digits, whitespace or hyphens, ending in a digit or an X
	// check character (case-insensitive). A match therefore always starts and
	// ends on a significant character; plain whitespace or hyphen runs do not
	// match, and a leading "ISBN" label is not part of the match. The pattern
	// is deliberately loose: neither length nor checksum are validated.
	ISBNPattern = regexp.MustCompile(`(?i)\d[\d\s\-]*[\dX]`)
)

// Truncate truncates a string.
func Truncate(s string, length int) string {
if len(s) < length || length < 0 {
Expand Down

0 comments on commit 0ae26e8

Please sign in to comment.