toolkit/mozapps/installer/find-dupes.py
Ricky Stewart e2fe57521b Bug 1638060 - Standardize interface of mozfile classes as bytes-based streams r=glandium
At the beginning of the Python 3 migration (circa bug 1602540), we updated the interface of `mozpack/files.py` to align with Python 3's built-in file semantics: opening a file in text mode returned a stream of `str` (text), and opening a file in binary mode returned a stream of `bytes`. That turned out to be more trouble than it was worth. This patch undoes those interface changes and moves back to the Python 2 style, where all files are bytestreams.

Differential Revision: https://phabricator.services.mozilla.com/D75424
2020-05-22 01:11:29 +00:00
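
A minimal sketch of the restored interface (illustrative, not part of the patch; it assumes mozpack's GeneratedFile helper):

    # After this change, mozpack file classes hand back bytes regardless
    # of the Python version running the build:
    from mozpack.files import GeneratedFile
    f = GeneratedFile(b'xyz')
    assert f.open().read() == b'xyz'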

131 lines
4.5 KiB
Python

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from __future__ import absolute_import, unicode_literals, print_function

import sys
import hashlib
import re
import os
import functools

from mozbuild.preprocessor import Preprocessor
from mozbuild.util import DefinesAction
from mozpack.packager.unpack import UnpackFinder
from mozpack.files import DeflatedFile
from collections import OrderedDict
import six
from six import StringIO
import argparse
import buildconfig

'''
Find files duplicated in a given packaged directory, independently of its
package format.
'''
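
# Typical invocation (illustrative; the manifest and directory names are
# examples, not fixed values):
#
#   python find-dupes.py -f allowed-dupes.mn dist/firefox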

def normalize_osx_path(p):
    '''
    Strips the first 3 elements of an OSX app path

    >>> normalize_osx_path('Nightly.app/foo/bar/baz')
    'baz'
    '''
    bits = p.split('/')
    if len(bits) > 3 and bits[0].endswith('.app'):
        return '/'.join(bits[3:])
    return p

def is_l10n_file(path):
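    '''
    Return whether `path` is a localization file; duplicates among these
    are tolerated. (Doctest paths below are illustrative, not real
    package entries.)

    >>> is_l10n_file('browser/chrome/en-US/locale/browser/browser.dtd')
    True
    >>> is_l10n_file('browser/content/browser.xul')
    False
    '''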
    return ('/locale/' in path or
            '/localization/' in path or
            path.startswith('localization/'))


def normalize_path(p):
    return normalize_osx_path(p)

def find_dupes(source, allowed_dupes, bail=True):
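    '''
    Hash every file found under `source` and group identical contents.

    `allowed_dupes` is an iterable of normalized paths that are accepted
    duplicates; any other duplicated, non-l10n file is reported and, when
    `bail` is True, makes the script exit with an error.
    '''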
    md5_chunk_size = 1024 * 10
    allowed_dupes = set(allowed_dupes)
    md5s = OrderedDict()
    for p, f in UnpackFinder(source):
        md5 = hashlib.md5()
        content_size = 0
        # Hash the file in chunks rather than reading it whole into memory.
        for buf in iter(functools.partial(f.open().read, md5_chunk_size), b''):
            md5.update(six.ensure_binary(buf))
            content_size += len(six.ensure_binary(buf))
        m = md5.digest()
        if m not in md5s:
            if isinstance(f, DeflatedFile):
                # Files inside jar/zip archives know their compressed size.
                compressed = f.file.compressed_size
            else:
                compressed = content_size
            md5s[m] = (content_size, compressed, [])
        md5s[m][2].append(p)

    total = 0
    total_compressed = 0
    num_dupes = 0
    unexpected_dupes = []
    for m, (size, compressed, paths) in sorted(six.iteritems(md5s),
                                               key=lambda x: x[1][1]):
        if len(paths) > 1:
            _compressed = ' (%d compressed)' % compressed if compressed != size else ''
            _times = ' (%d times)' % (len(paths) - 1) if len(paths) > 2 else ''
            print('Duplicates {} bytes{}{}:'.format(size, _compressed, _times))
            print(''.join(' %s\n' % p for p in paths))
            total += (len(paths) - 1) * size
            total_compressed += (len(paths) - 1) * compressed
            num_dupes += 1

            for p in paths:
                if not is_l10n_file(p) and normalize_path(p) not in allowed_dupes:
                    unexpected_dupes.append(p)

    if num_dupes:
        total_compressed = '%d compressed' % total_compressed \
            if total_compressed != total else 'uncompressed'
        print("WARNING: Found {} duplicated files taking {} bytes ({})".format(
            num_dupes, total, total_compressed))

    if unexpected_dupes:
        errortype = "ERROR" if bail else "WARNING"
        print("{}: The following duplicated files are not allowed:".format(errortype))
        print("\n".join(unexpected_dupes))
        if bail:
            sys.exit(1)

def main():
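    '''Parse arguments, load allowed-duplicates manifests, run the dupe check.'''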
    parser = argparse.ArgumentParser(description='Find duplicate files in directory.')
    parser.add_argument('--warning', '-w', action='store_true',
                        help='Only warn about duplicates, do not exit with an error')
    parser.add_argument('--file', '-f', action='append', dest='dupes_files', default=[],
                        help='Add exceptions to the duplicate list from this file')
    parser.add_argument('-D', action=DefinesAction)
    parser.add_argument('-U', action='append', default=[])
    parser.add_argument('directory',
                        help='The directory to check for duplicates in')

    args = parser.parse_args()
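
    # Manifests are run through the build preprocessor, so they can use
    # #ifdef/#endif blocks and @VARIABLE@ substitutions; e.g. (illustrative):
    #
    #   #ifdef XP_MACOSX
    #   path/to/some/accepted-duplicate
    #   #endif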
    allowed_dupes = []
    for filename in args.dupes_files:
        pp = Preprocessor()
        pp.context.update(buildconfig.defines['ALLDEFINES'])
        if args.D:
            pp.context.update(args.D)
        for undefine in args.U:
            if undefine in pp.context:
                del pp.context[undefine]
        pp.out = StringIO()
        pp.do_filter('substitution')
        pp.do_include(filename)
        # Strip trailing '#' comments and whitespace from each line.
        allowed_dupes.extend([line.partition('#')[0].rstrip()
                              for line in pp.out.getvalue().splitlines()])

    find_dupes(args.directory, bail=not args.warning, allowed_dupes=allowed_dupes)

if __name__ == "__main__":
    main()