forked from digibib/marc2rdf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoai_harvester.rb
125 lines (102 loc) · 3.86 KB
/
oai_harvester.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env ruby
# encoding: UTF-8
if RUBY_VERSION <= "1.8.7" then $KCODE = 'u' end #needed for string conversion in ruby 1.8.7
require 'bundler/setup'
require 'builder'
require 'rubygems'
require 'yaml'
require 'oai'
require 'marc'
require 'rdf'
require 'rdf/rdfxml'
require 'rdf/ntriples'
require 'rest_client'
CONFIG = YAML::load_file('config/config.yml')
MAPPINGFILE = YAML::load_file(CONFIG['mapping']['file'])
require './lib/rdfmodeler.rb'
require './lib/sparql_update.rb'
require './lib/string_replace.rb'
# Prints +s+ followed by the command-line synopsis to standard error, then
# terminates the process with exit status 2.
#
# @param s [String] message shown above the synopsis
# @return [void] does not return; always exits
def usage(s)
  program = File.basename($0)
  [
    s,
    "Usage: \n",
    "#{program} [-f fromdate] [-r recordlimit]\n",
    " -r [number] stops processing after given number of records\n",
    " -f 'date' harvests records starting from the given date. Default is yesterday.\n",
    " -d debug output to stdout.\n"
  ].each { |line| $stderr.puts(line) }
  exit(2)
end
# Defaults: harvest from yesterday's date unless -f overrides it.
$fromdate = Date.today.prev_day.to_s

# Consumes leading option switches from +argv+ (the array is mutated) and
# stores their values in the globals $fromdate, $recordlimit and $debug.
# Parsing stops at the first argument that is not a switch; that argument and
# everything after it stay in +argv+.
#
# The +usage+ helper defined in this file is called for -h, for an unknown
# switch, and for -f / -r given without a usable value.
#
# @param argv [Array<String>] command-line arguments, normally ARGV
# @return [void]
def parse_harvester_options(argv)
  loop do
    case argv.first
    when '-f'
      argv.shift
      fromdate = argv.shift
      # A trailing "-f" used to set $fromdate to nil without any complaint,
      # and "-f -d" swallowed the next switch as the date.
      if fromdate.nil? || fromdate.empty? || fromdate.start_with?('-')
        usage("Option -f requires a date argument")
      end
      $fromdate = fromdate
    when '-r'
      argv.shift
      limit = argv.shift
      # String#to_i turns nil and non-numeric text into 0, which made the
      # harvester silently process no records; insist on plain digits instead.
      unless limit.to_s =~ /\A\d+\z/
        usage("Option -r requires a number, got: #{limit.inspect}")
      end
      $recordlimit = limit.to_i
    when '-d'
      argv.shift
      $debug = true
    when '-h' then usage("help")
    when /^-/ then usage("Unknown option: #{argv.first.inspect}")
    else break
    end
  end
end

parse_harvester_options(ARGV)
=begin
Start processing
- load mappingfile tags into object 'yamltags'
- harvest records from the OAI-PMH repository given in the config file
- iterate MARC records
- model record tag by tag, match yaml file containing RDF mappings, iterate subfields either as array or one by one
- hand each processed record to RestClient.sparql_update (deleted records go to RestClient.sparql_purge)
=end
# Tag mappings from the mapping file, published as a class variable.
# NOTE(review): presumably read by RDFModeler; confirm in lib/rdfmodeler.rb.
@@yamltags = MAPPINGFILE['tags']

# All OAI client settings come from the 'oai' section of the config file.
oai_settings = CONFIG['oai']
client_options = {
  :redirects => oai_settings['follow_redirects'],
  :parser    => oai_settings['parser'],
  :timeout   => oai_settings['timeout'],
  :debug     => true
}
client = OAI::Client.new(oai_settings['repository_url'], client_options)

# First request: everything between $fromdate and today.
response = client.list_records(
  :metadata_prefix => oai_settings['format'],
  :from            => $fromdate,
  :until           => Date.today.to_s
)

# Collect the records of the first response.
oairecords = []
response.each { |oairecord| oairecords << oairecord }

# While the response carries a non-empty resumption token, request the next
# batch with it and append those records as well.
while (token = response.resumption_token) && !token.empty?
  response = client.list_records(:resumption_token => token)
  response.each { |oairecord| oairecords << oairecord }
end
# Walks the harvested OAI records: deleted ones are purged, the others are
# parsed as MARCXML, converted with RDFModeler and pushed with
# RestClient.sparql_update. Prints one line per record and a final count.
seen = 0      # OAI records visited so far; only used to enforce -r
modified = 0  # non-deleted records that were converted and pushed

# start writer handle
RDF::Writer.for(:ntriples).buffer do |writer|
  # Expose the writer through the class variable @@writer.
  # NOTE(review): presumably the lib/ code writes through it while records
  # are processed; confirm in lib/rdfmodeler.rb.
  @@writer = writer

  oairecords.each do |oairecord|
    seen += 1
    # -r limit: stop before touching record number limit + 1.
    break if $recordlimit && seen > $recordlimit

    ## OAI SPECIFIC PARSING ##
    # The title number is the last colon-separated part of the OAI identifier.
    titlenumber = oairecord.header.identifier.split(':').last

    if oairecord.deleted?
      puts "deleted: #{titlenumber}"
      RestClient.sparql_purge(titlenumber)
      next # deleted records have no metadata in oai
    end

    puts "modified: #{titlenumber}"
    # Read the record's metadata into a MARCXML reader.
    xmlreader = MARC::XMLReader.new(StringIO.new(oairecord.metadata.to_s))
    xmlreader.each do |record|
      # Initiate the RDF record, set its type and convert the MARC fields.
      rdfrecord = RDFModeler.new(record)
      rdfrecord.set_type(CONFIG['resource']['resource_type'])
      rdfrecord.marc2rdf_convert(record)
      # and do sparql update, preserving harvested resources
      RestClient.sparql_update(titlenumber, :preserve => CONFIG['oai']['preserve_on_update'])
    end
    modified += 1
  end
end

# Report only records that were actually modified. The old loop counter also
# counted deleted records and the record that tripped the -r limit.
puts "modified records: #{modified}"