forked from mime-types/ruby-mime-types
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Rakefile
205 lines (168 loc) · 5.62 KB
/
Rakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# -*- ruby encoding: utf-8 -*-
require 'rubygems'
require 'hoe'
# Activate the Hoe plugins used by this project, in their original order.
[
  :bundler,
  :doofus,
  :email,
  :gemspec,
  :git,
  :rubyforge,
  :minitest,
  :travis
].each { |plugin_name| Hoe.plugin plugin_name }

# Gem specification for mime-types. The object returned by Hoe.spec is kept
# in +spec+.
spec = Hoe.spec 'mime-types' do
  # NOTE(review): the e-mail address below looks like a redacted placeholder;
  # confirm the intended address before releasing.
  developer('Austin Ziegler', '[email protected]')

  # Documentation publishing settings.
  self.remote_rdoc_dir = '.'
  rsync_args << ' --exclude=statsvn/'

  # Documentation sources: every *.rdoc file in the project root.
  self.history_file = 'History.rdoc'
  self.readme_file = 'README.rdoc'
  self.extra_rdoc_files = FileList["*.rdoc"].to_a

  # Development-only dependencies, appended in the original order.
  [
    ['hoe-bundler', '~> 1.2'],
    ['hoe-doofus', '~> 1.0'],
    ['hoe-gemspec', '~> 1.0'],
    ['hoe-git', '~> 1.5'],
    ['hoe-rubygems', '~> 1.0'],
    ['hoe-travis', '~> 1.2'],
    ['minitest', '~> 4.5'],
    ['nokogiri', '~> 1.5'],
    ['rake', '~> 10.0']
  ].each { |dependency| extra_dev_deps << dependency }
end
namespace :mime do
desc "Download the current MIME type registrations from IANA."
task :iana, :save, :destination do |t, args|
# Task arguments (both optional):
#   save        - which output to write: text (the default), html, or both.
#   destination - directory the output is written to (default "type-lists").
save_type = (args.save || :text).to_sym
unless [:text, :both, :html].include?(save_type)
  raise "Unknown save type provided. Must be one of text, both, or html."
end
destination = args.destination || "type-lists"
require 'open-uri'
require 'nokogiri'
require 'cgi'
# Scraper for the IANA media-type registry.
#
# IANAParser.load_index reads the registry index page and builds one parser
# per linked registry page. Each parser can then download its page, save the
# raw HTML, parse the registration table into text-list entries of the form
#
#   type/subtype 'IANA,ref[,ref...]
#
# and save those entries as a text file. Parsers sort by name (Comparable).
class IANAParser
  include Comparable

  INDEX = %q(http://www.iana.org/assignments/media-types/)
  CONTACT_PEOPLE = %r{http://www.iana.org/assignments/contact-people.html?#(.*)}
  RFC_EDITOR = %r{http://www.rfc-editor.org/rfc/rfc(\d+).txt}
  IETF_RFC = %r{http://www.ietf.org/rfc/rfc(\d+).txt}
  IETF_RFC_TOOLS = %r{http://tools.ietf.org/html/rfc(\d+)}
  # Locations starting with one of these schemes are fetched over the
  # network; anything else is treated as a local file path.
  REMOTE_LOCATION = %r{\A(?:https?|ftp)://}i

  class << self
    # Reads the index page and registers an IANAParser in +types+ (keyed by
    # link text) for every link that points at a media-type registry page.
    def load_index
      @types ||= {}
      Nokogiri::HTML(read_source(INDEX)).xpath('//p/a').each do |tag|
        href_match = %r{^/assignments/media-types/(.+)/$}.match(tag['href'])
        next if href_match.nil?
        type = href_match.captures[0]
        @types[tag.content] = IANAParser.new(tag.content, type)
      end
    end

    # Hash of link text => IANAParser; nil until load_index has been called.
    attr_reader :types

    # Returns the content found at +location+ as a String.
    #
    # URLs are opened through URI#open (provided by open-uri) rather than
    # Kernel#open: Ruby 3.0 removed open-uri's Kernel#open override, and
    # Kernel#open would also run a command for a location starting with "|".
    # Non-URL locations are read as local files.
    def read_source(location)
      if location.to_s =~ REMOTE_LOCATION
        URI.parse(location.to_s).open { |io| io.read }
      else
        File.open(location) { |io| io.read }
      end
    end
  end

  # name - the link text from the index page (also used for file names).
  # type - the path segment of the registry page, appended to INDEX.
  def initialize(name, type)
    @name = name
    @type = type
    @url = File.join(INDEX, @type)
  end

  attr_reader :name
  attr_reader :type
  attr_reader :url
  # The parsed document from the last call to #download (nil before that).
  attr_reader :html

  # Fetches and parses the registry page. +name+ may be given to read from a
  # different location (URL or local file) instead of #url.
  def download(name = nil)
    @html = Nokogiri::HTML(self.class.read_source(name || @url))
  end

  # Writes the downloaded document to "<name>.html" in the current directory.
  def save_html
    File.open("#@name.html", "wb") { |w| w.write @html }
  end

  # Parsers are ordered by name.
  def <=>(o)
    name <=> o.name
  end

  # Converts the table rows of the downloaded document into text-list
  # entries, stores them for #save_text, and returns them as an Array.
  # The first row is always treated as a heading and skipped. Returns an
  # empty Array (after a warning) when the document has no matching rows.
  def parse
    rows = html.xpath("//table//table//tr")
    header = rows.first
    if header.nil?
      warn "no table rows found in #@url"
      return @mime_types = []
    end

    # How many element children should each data row have?
    expected_cells = element_children(header).size
    if expected_cells == 1
      # The title row doesn't have what we expect, so measure the first real
      # row instead. next_element is used because the plain next sibling is
      # frequently a whitespace text node, which has no children at all.
      first_data_row = header.next_element
      expected_cells = element_children(first_data_row).size if first_data_row
    end

    @mime_types = rows.map { |row| entry_for(row, header, expected_cells) }.compact
  end

  # Writes the entries produced by #parse to "<name>.txt", one per line.
  def save_text
    File.open("#@name.txt", "wb") { |w| w.write @mime_types.join("\n") }
  end

  private

  # Element (non-text) children of +node+.
  def element_children(node)
    node.children.select { |child| child.elem? }
  end

  # Builds the text-list entry for one table row, or returns nil when the row
  # is the heading row, has no cells, or has an unexpected number of cells.
  def entry_for(row, header, expected_cells)
    return nil if row == header

    cells = element_children(row)
    return nil if cells.size.zero?

    if cells.size != expected_cells
      warn "size mismatch (#{cells.size} != #{expected_cells}) in node: #{row}"
      return nil
    end

    references = element_children(cells[reference_column(cells.size)])
    refs = references.map { |ref| format_reference(ref) }.join(',')
    subtype = cells[1].content.chomp.strip

    "#@type/#{subtype} 'IANA,#{refs}"
  end

  # Index of the cell holding the references: the last cell of a three- or
  # four-cell row. Any other row width is rejected.
  def reference_column(cell_count)
    case cell_count
    when 3 then 2
    when 4 then 3
    else raise "Unknown element size."
    end
  end

  # Renders one reference link: "[Name]" or "[Text=Name]" for contact-people
  # links, "RFCnnnn" for RFC links, "{Text=URL}" for any other http(s) link.
  # Anything else is returned untouched.
  def format_reference(ref)
    case ref['href']
    when CONTACT_PEOPLE
      tag = CGI::unescape($1).chomp.strip
      tag == ref.content ? "[#{ref.content}]" : "[#{ref.content}=#{tag}]"
    when RFC_EDITOR, IETF_RFC, IETF_RFC_TOOLS
      "RFC#$1"
    when %r{(https?://.*)}
      "{#{ref.content}=#$1}"
    else
      ref
    end
  end
end
# Fetch the registry index, then download every listed registry page into
# +destination+, writing HTML and/or text output as selected by +save_type+.
puts "Downloading index of MIME types from #{IANAParser::INDEX}."
IANAParser.load_index

require 'fileutils'
FileUtils.mkdir_p destination

wants_html = [:html, :both].include?(save_type)
wants_text = [:text, :both].include?(save_type)
# Index entries with these names are not downloaded.
ignored_registries = %w(example mime)

Dir.chdir destination do
  IANAParser.types.values.sort.each do |parser|
    next if ignored_registries.include?(parser.name)

    puts "Downloading #{parser.name} from #{parser.url}"
    parser.download

    if wants_html
      puts "Saving #{parser.name}.html"
      parser.save_html
    end

    next unless wants_text

    puts "Parsing #{parser.name} HTML"
    parser.parse
    puts "Saving #{parser.name}.txt"
    parser.save_text
  end
end
end
desc "Shows known MIME type sources."
task :mime_type_sources do
  # Prints the known MIME type source URLs, one per line.
  known_sources = [
    "http://www.ltsw.se/knbase/internet/mime.htp",
    "http://www.webmaster-toolkit.com/mime-types.shtml",
    "http://plugindoc.mozdev.org/winmime.php",
    "http://standards.freedesktop.org/shared-mime-info-spec/shared-mime-info-spec-latest.html",
    "http://www.feedforall.com/mime-types.htm",
    "http://www.iana.org/assignments/media-types/"
  ]
  puts known_sources
end
end
# vim: syntax=ruby