debian-mirror-gitlab/lib/gitlab/sanitizers/exif.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

197 lines
5.8 KiB
Ruby
Raw Normal View History

2019-04-03 18:18:56 +05:30
# frozen_string_literal: true
module Gitlab
module Sanitizers
class Exif
# These tags are not removed from the image.
WHITELISTED_TAGS = %w(
  ResolutionUnit
  XResolution
  YResolution
  YCbCrSubSampling
  YCbCrPositioning
  BitsPerSample
  ImageHeight
  ImageWidth
  ImageSize
  Copyright
  CopyrightNotice
  Orientation
).freeze

# These tags are common in exiftool output. They do not contain any
# sensitive information, but we don't need to preserve them when
# removing exif tags.
IGNORED_TAGS = %w(
  ColorComponents
  EncodingProcess
  ExifByteOrder
  ExifToolVersion
  JFIFVersion
  Directory
  FileAccessDate
  FileInodeChangeDate
  FileModifyDate
  FileName
  FilePermissions
  FileSize
  SourceFile
  Megapixels
  FileType
  FileTypeExtension
  MIMEType
).freeze

# Every tag that may be present without the file being considered dirty
# (subtracted from the tags reported by exiftool in `extra_tags`).
ALLOWED_TAGS = (WHITELISTED_TAGS + IGNORED_TAGS).freeze

# One "-TagName" exiftool argument per whitelisted tag; spliced into the
# exiftool command line built by `exec_remove_exif!`.
EXCLUDE_PARAMS = WHITELISTED_TAGS.map { |tag| "-#{tag}" }.freeze

# MIME types accepted by `check_for_allowed_types`.
ALLOWED_MIME_TYPES = %w(image/jpeg image/tiff).freeze
2019-04-03 18:18:56 +05:30
attr_reader :logger
2020-11-24 15:15:51 +05:30
# Builds a sanitizer that reports through the given logger.
#
# @param logger the object that receives info/error/debug messages;
#   defaults to Gitlab::AppLogger
def initialize(logger: Gitlab::AppLogger)
  @logger = logger
end
# rubocop: disable CodeReuse/ActiveRecord

# Runs `clean` over every Upload whose path ends (case-insensitively) in
# .jpg, .jpeg or .tiff, iterating in batches of 1000 records.
#
# A failure on one upload is logged (message at error level, backtrace at
# debug level) and the batch continues with the next record.
#
# @param start_id first Upload id to process; blank means no lower bound
# @param stop_id last Upload id to process; blank means the id of
#   `Upload.last` evaluated when the batch starts
# @param dry_run when true nothing is rewritten, findings are only logged
# @param sleep_time passed to `sleep` after each successfully cleaned
#   upload; nil disables the pause
# @param uploader when given, restricts the batch to uploads with this
#   `uploader` value
# @param since when given, restricts the batch to uploads with
#   `created_at` later than this value
def batch_clean(start_id: nil, stop_id: nil, dry_run: true, sleep_time: nil, uploader: nil, since: nil)
  relation = Upload.where('lower(path) like ? or lower(path) like ? or lower(path) like ?',
                          '%.jpg', '%.jpeg', '%.tiff')
  relation = relation.where(uploader: uploader) if uploader
  relation = relation.where('created_at > ?', since) if since

  logger.info "running in dry run mode, no images will be rewritten" if dry_run

  find_params = {
    start: start_id.present? ? start_id.to_i : nil,
    finish: stop_id.present? ? stop_id.to_i : Upload.last&.id,
    batch_size: 1000
  }

  relation.find_each(**find_params) do |upload|
    clean(upload.retrieve_uploader, dry_run: dry_run)
    sleep sleep_time if sleep_time
  rescue StandardError => err
    # Keep going: one broken upload must not abort the whole batch.
    logger.error "failed to sanitize #{upload_ref(upload)}: #{err.message}"
    logger.debug err.backtrace.join("\n ")
  end
end
# rubocop: enable CodeReuse/ActiveRecord
# Sanitizes a single upload: copies it into a temporary directory, looks
# for tags outside the allowed set and, unless this is a dry run, strips
# them and stores the result back through the uploader.
#
# @param uploader the uploader whose file should be inspected
# @param dry_run when true the findings are only logged
# @return the result of `remove_and_store`, or nil when nothing was rewritten
def clean(uploader, dry_run: true)
  Dir.mktmpdir('gitlab-exif') do |workdir|
    local_copy = fetch_upload_to_file(uploader, workdir)
    unwanted = extra_tags(local_copy)
    ref = upload_ref(uploader.upload)

    if unwanted.empty?
      logger.info "#{ref}: only whitelisted tags present, skipping"
      nil
    else
      logger.info "#{ref}: found exif tags to remove: #{unwanted}"
      remove_and_store(workdir, local_copy, uploader) unless dry_run
    end
  end
end
2022-05-07 20:08:51 +05:30
# Sanitizes a file that already exists on disk, in place.
#
# @param src_path path of the file to inspect and rewrite
# @param dry_run when true the findings are only logged
# @param content file contents used for the type check; read from
#   `src_path` when not supplied
# @param skip_unallowed_types when true an unsupported file type makes the
#   method return quietly instead of raising
# @return the result of `exec_remove_exif!`, or nil when nothing was rewritten
def clean_existing_path(src_path, dry_run: false, content: nil, skip_unallowed_types: false)
  content ||= File.read(src_path)

  # Lenient mode bails out on unsupported types; strict mode lets the check raise.
  return if skip_unallowed_types && !check_for_allowed_types(content, raise_error: false)

  check_for_allowed_types(content) unless skip_unallowed_types

  unwanted = extra_tags(src_path)

  if unwanted.empty?
    logger.info "#{src_path}: only whitelisted tags present, skipping"
    nil
  else
    logger.info "#{src_path}: found exif tags to remove: #{unwanted}"
    exec_remove_exif!(src_path) unless dry_run
  end
end
2021-04-15 22:33:27 +05:30
private
2019-04-03 18:18:56 +05:30
# Lists the tag names exiftool reports for `path` that are not in
# ALLOWED_TAGS, in the order exiftool returned them.
def extra_tags(path)
  reported = exif_tags(path).keys
  reported.reject { |tag| ALLOWED_TAGS.include?(tag) }
end
# Strips the exif data from the local copy at `src_path` and hands the
# sanitized file back to the uploader for storage.
#
# `tmpdir` is accepted but not used by the body.
#
# @return whatever `uploader.store!` returns
def remove_and_store(tmpdir, src_path, uploader)
  exec_remove_exif!(src_path)

  ref = upload_ref(uploader.upload)
  logger.info "#{ref}: exif removed, storing"

  File.open(src_path) do |sanitized|
    uploader.store!(sanitized)
  end
end
# Runs exiftool on `path` to strip metadata, passing EXCLUDE_PARAMS after
# `-tagsFromFile @`, and sanity-checks the outcome.
#
# @raise [RuntimeError] when exiftool exits non-zero, when the resulting
#   file is empty, or when its size equals the size of the backup copy
# @return [nil]
def exec_remove_exif!(path)
  # IPTC and XMP-iptcExt groups may keep copyright information so
  # we always preserve them
  preserved_groups = %w[--IPTC:all --XMP-iptcExt:all]
  cmd = %w[exiftool -all= -tagsFromFile @] + EXCLUDE_PARAMS + preserved_groups + [path]

  output, status = Gitlab::Popen.popen(cmd)
  raise "exiftool return code is #{status}: #{output}" unless status == 0

  sanitized_size = File.size(path)
  raise "size of file is 0" if sanitized_size == 0

  # exiftool creates backup of the original file in filename_original
  backup_size = File.size("#{path}_original")
  raise "size of sanitized file is same as original size" if sanitized_size == backup_size
end
# Writes the uploader's contents into `dir` and returns the path of the
# new file.
#
# @param uploader object responding to `filename` and `read`
# @param dir directory in which the local copy is created
# @raise [RuntimeError] from `check_for_allowed_types` when the contents
#   are not of a supported type; nothing is written in that case
# @return [String] path of the written file
def fetch_upload_to_file(uploader, dir)
  # upload is stored into the file with the original name - this filename
  # is used by carrierwave when storing the file back to the storage
  filename = File.join(dir, uploader.filename)

  contents = uploader.read
  check_for_allowed_types(contents)

  # Binary mode so image bytes are written without any translation.
  File.open(filename, 'wb') { |file| file.write(contents) }

  filename
end
2022-05-07 20:08:51 +05:30
# Detects the MIME type of `contents` and checks it against
# ALLOWED_MIME_TYPES.
#
# @param contents data handed to Gitlab::Utils::MimeType.from_string
# @param raise_error when true an unsupported type raises instead of
#   returning false
# @raise [RuntimeError] when the type is unsupported and `raise_error` is true
# @return [Boolean] whether the detected type is allowed
def check_for_allowed_types(contents, raise_error: true)
  mime_type = Gitlab::Utils::MimeType.from_string(contents)
  allowed = ALLOWED_MIME_TYPES.include?(mime_type)

  if !allowed && raise_error
    raise "File type #{mime_type} not supported. Only supports #{ALLOWED_MIME_TYPES.join(", ")}."
  end

  allowed
end
2019-04-03 18:18:56 +05:30
# Short "id:path" label identifying an upload in log messages.
def upload_ref(upload)
  format('%s:%s', upload.id, upload.path)
end
# Asks exiftool for the tags of `path` as JSON and returns the first
# element of the parsed output.
#
# The IPTC and XMP-iptcExt groups are excluded from the listing, matching
# the groups that `exec_remove_exif!` leaves in place.
#
# @raise [RuntimeError] when exiftool exits with a non-zero status
# @return the first entry of the parsed exiftool JSON output
def exif_tags(path)
  cmd = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", path]
  output, status = Gitlab::Popen.popen(cmd)

  raise "failed to get exif tags: #{output}" if status != 0

  Gitlab::Json.parse(output).first
end
end
end
end