Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MIME types for most popular formats #188

Merged
merged 17 commits into from
Jan 26, 2021
3 changes: 3 additions & 0 deletions lib/archive.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def to_json(*a)
# it can be placed here
attr_accessor :intrinsics

# The MIME type of the archive
attr_accessor :content_type

# Only permits assignments via defined accessors
def initialize(**attributes)
attributes.map { |(k, v)| public_send("#{k}=", v) }
Expand Down
3 changes: 3 additions & 0 deletions lib/audio.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ class Audio
# it can be placed here
attr_accessor :intrinsics

# The MIME type of the sound file
attr_accessor :content_type

# Only permits assignments via defined accessors
def initialize(**attributes)
attributes.map { |(k, v)| public_send("#{k}=", v) }
Expand Down
1 change: 1 addition & 0 deletions lib/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class Document
attr_accessor :format
attr_accessor :document_type
attr_accessor :page_count
attr_accessor :content_type

# Only permits assignments via defined accessors
def initialize(**attributes)
Expand Down
3 changes: 3 additions & 0 deletions lib/image.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ class Image
# it can be placed here
attr_accessor :intrinsics

# The MIME type of the image file
attr_accessor :content_type

# Only permits assignments via defined accessors
def initialize(**attributes)
attributes.map { |(k, v)| public_send("#{k}=", v) }
Expand Down
5 changes: 4 additions & 1 deletion lib/parsers/aiff_parser.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
class FormatParser::AIFFParser
include FormatParser::IOUtils

AIFF_MIME_TYPE = 'audio/x-aiff'

# Known chunk types we can omit when parsing,
# grossly lifted from http://www.muratnkonar.com/aiff/
KNOWN_CHUNKS = [
Expand Down Expand Up @@ -70,7 +72,8 @@ def unpack_comm_chunk(io)
num_audio_channels: channels,
audio_sample_rate_hz: sample_rate.to_i,
media_duration_frames: sample_frames,
media_duration_seconds: duration_in_seconds
media_duration_seconds: duration_in_seconds,
content_type: AIFF_MIME_TYPE,
)
end

Expand Down
3 changes: 3 additions & 0 deletions lib/parsers/bmp_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ class FormatParser::BMPParser

VALID_BMP = 'BM'
PERMISSIBLE_PIXEL_ARRAY_LOCATIONS = 26..512
BMP_MIME_TYPE = 'image/bmp'

def likely_match?(filename)
filename =~ /\.bmp$/i
Expand Down Expand Up @@ -42,6 +43,7 @@ def parse_bitmap_core_header(dib_header)
width_px: width,
height_px: height,
color_mode: :rgb,
content_type: BMP_MIME_TYPE,
intrinsics: {
data_order: data_order,
bits_per_pixel: bit_depth
Expand All @@ -63,6 +65,7 @@ def parse_modern_header(dib_header)
width_px: width,
height_px: height.abs,
color_mode: :rgb,
content_type: BMP_MIME_TYPE,
intrinsics: {
vertical_resolution: vertical_res,
horizontal_resolution: horizontal_res,
Expand Down
2 changes: 2 additions & 0 deletions lib/parsers/cr2_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class FormatParser::CR2Parser

TIFF_HEADER = [0x49, 0x49, 0x2a, 0x00]
CR2_HEADER = [0x43, 0x52, 0x02, 0x00]
CR2_MIME_TYPE = 'image/x-canon-cr2'

def likely_match?(filename)
filename =~ /\.cr2$/i
Expand Down Expand Up @@ -39,6 +40,7 @@ def call(io)
display_height_px: exif_data.rotated? ? w : h,
orientation: exif_data.orientation_sym,
intrinsics: {exif: exif_data},
content_type: CR2_MIME_TYPE,
)
rescue EXIFR::MalformedTIFF
nil
Expand Down
6 changes: 6 additions & 0 deletions lib/parsers/dpx_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ class FormatParser::DPXParser
BE_MAGIC = 'SDPX'
LE_MAGIC = BE_MAGIC.reverse

# There is no official MIME type for DPX, so we have
# to invent something useful. We will prefix it with x-
# to indicate that it is a vendor subtype
DPX_MIME_TYPE = 'image/x-dpx'

class ByteOrderHintIO < SimpleDelegator
def initialize(io, is_little_endian)
super(io)
Expand Down Expand Up @@ -61,6 +66,7 @@ def call(io)
display_width_px: display_w,
display_height_px: display_h,
intrinsics: dpx_structure,
content_type: DPX_MIME_TYPE,
)
end

Expand Down
2 changes: 2 additions & 0 deletions lib/parsers/flac_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ class FormatParser::FLACParser
MAGIC_BYTES = 4
MAGIC_BYTE_STRING = 'fLaC'
BLOCK_HEADER_BYTES = 4
FLAC_MIME_TYPE = 'audio/x-flac'

def likely_match?(filename)
filename =~ /\.flac$/i
Expand Down Expand Up @@ -61,6 +62,7 @@ def call(io)
audio_sample_rate_hz: sample_rate,
media_duration_seconds: duration,
media_duration_frames: total_samples,
content_type: FLAC_MIME_TYPE,
intrinsics: {
bits_per_sample: bits_per_sample,
minimum_frame_size: minimum_frame_size,
Expand Down
2 changes: 2 additions & 0 deletions lib/parsers/gif_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ class FormatParser::GIFParser

HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
GIF_MIME_TYPE = 'image/gif'

def likely_match?(filename)
filename =~ /\.gif$/i
Expand Down Expand Up @@ -45,6 +46,7 @@ def call(io)
height_px: h,
has_multiple_frames: is_animated,
color_mode: :indexed,
content_type: GIF_MIME_TYPE
)
end

Expand Down
2 changes: 2 additions & 0 deletions lib/parsers/jpeg_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class InvalidStructure < StandardError
APP1_MARKER = 0xE1 # maybe EXIF
EXIF_MAGIC_STRING = "Exif\0\0".b
MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
JPEG_MIME_TYPE = 'image/jpeg'

def self.likely_match?(filename)
filename =~ /\.jpe?g$/i
Expand Down Expand Up @@ -88,6 +89,7 @@ def scan
display_height_px: dh,
orientation: flat_exif.orientation_sym,
intrinsics: {exif: flat_exif},
content_type: JPEG_MIME_TYPE
)

return result
Expand Down
4 changes: 3 additions & 1 deletion lib/parsers/m3u_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ class FormatParser::M3UParser
include FormatParser::IOUtils

HEADER = '#EXTM3U'
M3U8_MIME_TYPE = 'application/vnd.apple.mpegurl' # https://en.wikipedia.org/wiki/M3U#Internet_media_types

def likely_match?(filename)
filename =~ /\.m3u8?$/i
Expand All @@ -14,7 +15,8 @@ def call(io)
return unless HEADER.eql?(header)

FormatParser::Text.new(
format: :m3u
format: :m3u,
content_type: M3U8_MIME_TYPE,
)
end
FormatParser.register_parser new, natures: :text, formats: :m3u
Expand Down
11 changes: 10 additions & 1 deletion lib/parsers/moov_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ class FormatParser::MOOVParser
'm4a ' => :m4a,
}

# https://tools.ietf.org/html/rfc4337#section-2
# There is also video/quicktime which we should be able to capture
# here, but there is currently no detection for MOVs versus MP4s
MP4_AU_MIME_TYPE = 'audio/mp4'
MP4_MIXED_MIME_TYPE = 'video/mp4'

# Filename-based check for the extensions this parser handles.
#
# @param filename [String] the file name to test
# @return [Integer, nil] offset of the matching extension, or nil when
#   the name does not end in one of the listed extensions
#
# The pattern is anchored with \z instead of $: in Ruby, $ matches at
# the end of any line, so a name such as "clip.mp4\nother" (or one with
# a trailing newline) would otherwise be reported as a match.
#
# NOTE(review): `ma4` looks like a transposition of `m4a`; it is kept
# so that every name accepted before is still accepted - confirm.
def likely_match?(filename)
  filename =~ /\.(mov|m4a|ma4|mp4|aac|m4v)\z/i
end
Expand Down Expand Up @@ -49,10 +55,12 @@ def call(io)
end

# M4A only contains audio, while MP4 and friends can contain video.
if format_from_moov_type(file_type) == :m4a
fmt = format_from_moov_type(file_type)
if fmt == :m4a
FormatParser::Audio.new(
format: format_from_moov_type(file_type),
media_duration_seconds: media_duration_s,
content_type: MP4_AU_MIME_TYPE,
intrinsics: atom_tree,
)
else
Expand All @@ -61,6 +69,7 @@ def call(io)
width_px: width,
height_px: height,
media_duration_seconds: media_duration_s,
content_type: MP4_MIXED_MIME_TYPE,
intrinsics: atom_tree,
)
end
Expand Down
5 changes: 3 additions & 2 deletions lib/parsers/mp3_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class InvalidDeepFetch < KeyError
MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
TIFF_HEADER_BYTES = [MAGIC_LE, MAGIC_BE]

MP3_MIME_TYPE = 'audio/mpeg'
# Wraps the Tag object returned by ID3Tag in such
# a way that a usable JSON representation gets
# returned
Expand Down Expand Up @@ -104,7 +104,8 @@ def call(raw_io)
# do not tell anything of substance
num_audio_channels: first_frame.channels,
audio_sample_rate_hz: first_frame.sample_rate,
intrinsics: id3tags_hash.merge(id3tags: tags)
intrinsics: id3tags_hash.merge(id3tags: tags),
content_type: MP3_MIME_TYPE,
)

extra_file_attirbutes = fetch_extra_attributes_from_id3_tags(id3tags_hash)
Expand Down
5 changes: 3 additions & 2 deletions lib/parsers/ogg_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
class FormatParser::OggParser
include FormatParser::IOUtils

# Maximum size of an Ogg page
MAX_POSSIBLE_PAGE_SIZE = 65307
OGG_MIME_TYPE = 'audio/ogg'

def likely_match?(filename)
filename =~ /\.ogg$/i
Expand Down Expand Up @@ -45,7 +45,8 @@ def call(io)
format: :ogg,
audio_sample_rate_hz: sample_rate,
num_audio_channels: channels,
media_duration_seconds: duration
media_duration_seconds: duration,
content_type: OGG_MIME_TYPE,
)
end

Expand Down
4 changes: 2 additions & 2 deletions lib/parsers/pdf_parser.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
class FormatParser::PDFParser
include FormatParser::IOUtils

# First 9 bytes of a PDF should be in this format, according to:
#
# https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
#
# There are however exceptions, which are left out for now.
#
PDF_MARKER = /%PDF-1\.[0-8]{1}/
PDF_CONTENT_TYPE = 'application/pdf'

def likely_match?(filename)
filename =~ /\.(pdf|ai)$/i
Expand All @@ -18,7 +18,7 @@ def call(io)

return unless safe_read(io, 9) =~ PDF_MARKER

FormatParser::Document.new(format: :pdf)
FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
end

FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 1
Expand Down
2 changes: 2 additions & 0 deletions lib/parsers/png_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class FormatParser::PNGParser
4 => true, # Grayscale with alpha
6 => true,
}
PNG_MIME_TYPE = 'image/png'

def likely_match?(filename)
filename =~ /\.png$/i
Expand Down Expand Up @@ -67,6 +68,7 @@ def call(io)
color_mode: color_mode,
has_multiple_frames: has_animation,
num_animation_or_video_frames: num_frames,
content_type: PNG_MIME_TYPE,
)
end

Expand Down
2 changes: 2 additions & 0 deletions lib/parsers/psd_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ class FormatParser::PSDParser
include FormatParser::IOUtils

PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
PSD_MIME_TYPE = 'application/x-photoshop'

def likely_match?(filename)
filename =~ /\.psd$/i # Maybe also PSB at some point
Expand All @@ -20,6 +21,7 @@ def call(io)
format: :psd,
width_px: w,
height_px: h,
content_type: PSD_MIME_TYPE,
)
end

Expand Down
12 changes: 10 additions & 2 deletions lib/parsers/tiff_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ class FormatParser::TIFFParser
MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
HEADER_BYTES = [MAGIC_LE, MAGIC_BE]
TIFF_MIME_TYPE = 'image/tiff'
ARW_MIME_TYPE = 'image/x-sony-arw'

def likely_match?(filename)
filename =~ /\.tiff?$/i
Expand All @@ -14,7 +16,10 @@ def call(io)
io = FormatParser::IOConstraint.new(io)

return unless HEADER_BYTES.include?(safe_read(io, 4))
io.seek(io.pos + 2) # Skip over the offset of the IFD, EXIFR will re-read it anyway

# Skip over the offset of the IFD,
# EXIFR will re-read it anyway
io.seek(io.pos + 2)
return if cr2?(io)

# The TIFF scanner in EXIFR is plenty good enough,
Expand All @@ -26,14 +31,17 @@ def call(io)
w = exif_data.width || exif_data.pixel_x_dimension
h = exif_data.height || exif_data.pixel_y_dimension

format = arw?(exif_data) ? :arw : :tif
mime_type = arw?(exif_data) ? ARW_MIME_TYPE : TIFF_MIME_TYPE
FormatParser::Image.new(
format: arw?(exif_data) ? :arw : :tif, # Specify format as arw for Sony ARW format images, else tif
format: format,
width_px: w,
height_px: h,
display_width_px: exif_data.rotated? ? h : w,
display_height_px: exif_data.rotated? ? w : h,
orientation: exif_data.orientation_sym,
intrinsics: {exif: exif_data},
content_type: mime_type,
)
rescue EXIFR::MalformedTIFF
nil
Expand Down
3 changes: 3 additions & 0 deletions lib/parsers/wav_parser.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
class FormatParser::WAVParser
include FormatParser::IOUtils

WAV_MIME_TYPE = 'audio/x-wav'

# Filename-based check for the ".wav" extension (case-insensitive).
#
# @param filename [String] the file name to test
# @return [Integer, nil] offset of the ".wav" suffix, or nil when the
#   name does not end in it
#
# The pattern is anchored with \z instead of $: in Ruby, $ matches at
# the end of any line, so a name such as "a.wav\nother" (or one with a
# trailing newline) would otherwise be reported as a match.
def likely_match?(filename)
  filename =~ /\.wav\z/i
end
Expand Down Expand Up @@ -96,6 +98,7 @@ def file_info(fmt_data, sample_frames)
audio_sample_rate_hz: fmt_data[:sample_rate],
media_duration_frames: sample_frames,
media_duration_seconds: duration_in_seconds,
content_type: WAV_MIME_TYPE,
)
end

Expand Down
8 changes: 5 additions & 3 deletions lib/parsers/zip_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ class FormatParser::ZIPParser
include OfficeFormats
include FormatParser::IOUtils

ZIP_MIME_TYPE = 'application/zip'

# Filename-based check for the extensions listed in the pattern below
# (case-insensitive).
#
# @param filename [String] the file name to test
# @return [Integer, nil] offset of the matching extension, or nil when
#   the name does not end in one of the listed extensions
#
# The pattern is anchored with \z instead of $: in Ruby, $ matches at
# the end of any line, so a name such as "a.zip\nother" (or one with a
# trailing newline) would otherwise be reported as a match.
def likely_match?(filename)
  filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)\z/i
end
Expand All @@ -25,10 +27,10 @@ def call(io)
end

if office_document?(filenames_set)
office_format = office_file_format_from_entry_set(filenames_set)
FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
office_format, mime_type = office_file_format_and_mime_type_from_entry_set(filenames_set)
FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive, content_type: mime_type)
else
FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive, content_type: ZIP_MIME_TYPE)
end
rescue FileReader::Error
# This is not a ZIP, or a broken ZIP.
Expand Down
Loading