1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# Copyright 2013 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7import difflib
8import hashlib
9import itertools
10import json
11import os
12import zipfile
13from .pycache import pycache_enabled
14from .pycache import pycache
15
# When PRINT_BUILD_EXPLANATIONS=1 is set in the environment and a difference
# is detected, a diff of what changed is printed.
PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))

# An escape hatch (FORCE_REBUILD=1) that causes all targets to be rebuilt.
_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))
21
22
def get_new_metadata(input_strings, input_paths):
    """Builds a _Metadata snapshot of the given input strings and paths.

    Zip files are recorded per-entry so per-subpath diffs are possible;
    every other path gets a single md5 tag.
    """
    metadata = _Metadata()
    metadata.add_strings(input_strings)
    for input_path in input_paths:
        if not _is_zip_file(input_path):
            metadata.add_file(input_path, _md5_for_path(input_path))
        else:
            metadata.add_zip_file(input_path, _extract_zip_entries(input_path))
    return metadata
34
35
def get_old_metadata(record_path):
    """Loads the previously recorded _Metadata, or None if unavailable.

    Args:
      record_path: Path to the metadata stamp file.

    Returns:
      A _Metadata instance, or None when the stamp file is missing, corrupt,
      or has an incompatible schema (callers treat None as "rebuild").
    """
    old_metadata = None
    if os.path.exists(record_path):
        with open(record_path, 'r') as jsonfile:
            try:
                old_metadata = _Metadata.from_file(jsonfile)
            except (ValueError, KeyError, TypeError):
                # Best-effort: a bad stamp file just forces a rebuild.
                # ValueError covers malformed JSON (JSONDecodeError subclasses
                # it); KeyError/TypeError cover an unexpected JSON shape. A
                # narrowed except no longer hides unrelated programming errors
                # the way the previous bare `except:` did.
                pass
    return old_metadata
45
46
def print_explanations(record_path, changes):
    """Prints why |record_path| is stale, when explanations are enabled."""
    if not PRINT_EXPLANATIONS:
        return
    banner = '=' * 80
    print(banner)
    print('Target is stale: %s' % record_path)
    print(changes.describe_difference())
    print(banner)
53
54
def call_and_record_if_stale(
        function,  # pylint: disable=invalid-name
        record_path=None,
        input_paths=None,
        input_strings=None,
        output_paths=None,
        force=False,
        pass_changes=False):
    """Calls function if outputs are stale.

    Outputs are considered stale if:
    - any output_paths are missing, or
    - the contents of any file within input_paths has changed, or
    - the contents of input_strings has changed.

    To debug which files are out-of-date, set the environment variable:
        PRINT_BUILD_EXPLANATIONS=1

    Args:
      function: The function to call.
      record_path: Path to record metadata.
        Defaults to output_paths[0] + '.md5.stamp'
      input_paths: List of paths to calculate a md5 sum on.
      input_strings: List of strings to record verbatim.
      output_paths: List of output paths.
      force: Whether to treat outputs as missing regardless of whether they
        actually are.
      pass_changes: Whether to pass a Changes instance to |function|.
    """
    # At least one of the two is needed to locate the metadata stamp.
    assert record_path or output_paths
    input_paths = input_paths or []
    input_strings = input_strings or []
    output_paths = output_paths or []

    new_metadata = get_new_metadata(input_strings, input_paths)
    force = force or _FORCE_REBUILD
    # With force set, every output counts as missing.
    missing_outputs = [
        x for x in output_paths if force or not os.path.exists(x)
    ]

    if pycache_enabled:
        # Input strings, input files and outputs names together compose
        # cache manifest, which is the only identifier of a python action.
        manifest = '-'.join(
            [new_metadata.strings_md5(),
             new_metadata.files_md5()] + sorted(output_paths))
        record_path = pycache.get_manifest_path('{}.manifest'.format(manifest))
        old_metadata = get_old_metadata(record_path)
    else:
        record_path = record_path or output_paths[0] + '.md5.stamp'
        # When outputs are missing, don't bother gathering change information.
        if not missing_outputs:
            old_metadata = get_old_metadata(record_path)
        else:
            old_metadata = None

    changes = Changes(old_metadata, new_metadata, force, missing_outputs)
    if not changes.has_changes():
        if not pycache_enabled:
            return
        # Metadata matched, but outputs may still need restoring from the
        # cache; fall through to a rebuild only if retrieval fails.
        if pycache_enabled and pycache.retrieve(output_paths, prefix=manifest):
            return

    print_explanations(record_path, changes)

    args = (changes, ) if pass_changes else ()
    function(*args)
    if pycache_enabled:
        # Cache-stat reporting is best-effort; never fail the build over it.
        try:
            pycache.report_cache_stat('cache_miss')
        except:  # noqa: E722 pylint: disable=bare-except
            pass
        pycache.save(output_paths, prefix=manifest)

    # Record the new metadata only after |function| succeeded, so a failed
    # build stays stale.
    with open(record_path, 'w') as record:
        new_metadata.to_file(record)
131
132
class Changes(object):
    """Provides an API for querying what changed between runs."""
    def __init__(self, old_metadata, new_metadata, force, missing_outputs):
        # old_metadata may be None (no previous stamp, or it was unreadable).
        self.old_metadata = old_metadata
        self.new_metadata = new_metadata
        self.force = force
        self.missing_outputs = missing_outputs

    def has_changes(self):
        """Returns whether any changes exist."""
        return (
            self.force or not self.old_metadata or
            self.old_metadata.strings_md5() != self.new_metadata.strings_md5()
            or self.old_metadata.files_md5() != self.new_metadata.files_md5())

    def added_or_modified_only(self):
        """Returns whether the only changes were from added or modified (sub)files.

        No missing outputs, no removed paths/subpaths.
        """
        if (self.force or not self.old_metadata
                or self.old_metadata.strings_md5() !=
                self.new_metadata.strings_md5()):
            return False
        if any(self.iter_removed_paths()):
            return False
        for path in self.iter_modified_paths():
            if any(self.iter_removed_subpaths(path)):
                return False
        return True

    def iter_all_paths(self):
        """Generator for paths."""
        return self.new_metadata.iter_paths()

    def iter_all_subpaths(self, path):
        """Generator for subpaths."""
        return self.new_metadata.iter_subpaths(path)

    def iter_added_paths(self):
        """Generator for paths that were added."""
        for path in self.new_metadata.iter_paths():
            if self._get_old_tag(path) is None:
                yield path

    def iter_added_subpaths(self, path):
        """Generator for paths that were added within the given zip file."""
        for subpath in self.new_metadata.iter_subpaths(path):
            if self._get_old_tag(path, subpath) is None:
                yield subpath

    def iter_removed_paths(self):
        """Generator for paths that were removed."""
        if self.old_metadata:
            for path in self.old_metadata.iter_paths():
                if self.new_metadata.get_tag(path) is None:
                    yield path

    def iter_removed_subpaths(self, path):
        """Generator for paths that were removed within the given zip file."""
        if self.old_metadata:
            for subpath in self.old_metadata.iter_subpaths(path):
                if self.new_metadata.get_tag(path, subpath) is None:
                    yield subpath

    def iter_modified_paths(self):
        """Generator for paths whose contents have changed."""
        for path in self.new_metadata.iter_paths():
            old_tag = self._get_old_tag(path)
            new_tag = self.new_metadata.get_tag(path)
            if old_tag is not None and old_tag != new_tag:
                yield path

    def iter_modified_subpaths(self, path):
        """Generator for paths within a zip file whose contents have changed."""
        for subpath in self.new_metadata.iter_subpaths(path):
            old_tag = self._get_old_tag(path, subpath)
            new_tag = self.new_metadata.get_tag(path, subpath)
            if old_tag is not None and old_tag != new_tag:
                yield subpath

    def iter_changed_paths(self):
        """Generator for all changed paths (added/removed/modified)."""
        return itertools.chain(self.iter_removed_paths(),
                               self.iter_modified_paths(),
                               self.iter_added_paths())

    def iter_changed_subpaths(self, path):
        """Generator for paths within a zip that were added/removed/modified."""
        return itertools.chain(self.iter_removed_subpaths(path),
                               self.iter_modified_subpaths(path),
                               self.iter_added_subpaths(path))

    def describe_difference(self):
        """Returns a human-readable description of what changed."""
        if self.force:
            return 'force=True'
        elif self.old_metadata is None:
            return 'Previous stamp file not found.'

        if self.old_metadata.strings_md5() != self.new_metadata.strings_md5():
            ndiff = difflib.ndiff(self.old_metadata.get_strings(),
                                  self.new_metadata.get_strings())
            changed = [s for s in ndiff if not s.startswith(' ')]
            return 'Input strings changed:\n  ' + '\n  '.join(changed)

        if self.old_metadata.files_md5() == self.new_metadata.files_md5():
            return "There's no difference."

        lines = []
        # Note: the generator must wrap the format() call. The previous code
        # passed the generator *into* format(), producing a single
        # "<generator object ...>" string which extend() then split into
        # individual characters.
        lines.extend('Added: {}'.format(p) for p in self.iter_added_paths())
        lines.extend('Removed: {}'.format(p)
                     for p in self.iter_removed_paths())
        for path in self.iter_modified_paths():
            lines.append('Modified: {}'.format(path))
            lines.extend('  -> Subpath added: {}'.format(p)
                         for p in self.iter_added_subpaths(path))
            lines.extend('  -> Subpath removed: {}'.format(p)
                         for p in self.iter_removed_subpaths(path))
            lines.extend('  -> Subpath modified: {}'.format(p)
                         for p in self.iter_modified_subpaths(path))
        if lines:
            return 'Input files changed:\n  {}'.format('\n  '.join(lines))

        if self.missing_outputs:
            return 'Outputs do not exist:\n  {}'.format('\n  '.join(
                self.missing_outputs))

        return 'I have no idea what changed (there is a bug).'

    def _get_old_tag(self, path, subpath=None):
        # Returns None when there is no old metadata or no such entry.
        return self.old_metadata and self.old_metadata.get_tag(path, subpath)
265
266
267class _Metadata(object):
268    """Data model for tracking change metadata."""
269    def __init__(self):
270        self._files_md5 = None
271        self._strings_md5 = None
272        self._files = []
273        self._strings = []
274        # Map of (path, subpath) -> entry. Created upon first call to _get_entry().
275        self._file_map = None
276
277    @classmethod
278    def from_file(cls, fileobj):
279        """Returns a _Metadata initialized from a file object."""
280        ret = cls()
281        obj = json.load(fileobj)
282        ret._files_md5 = obj['files-md5']
283        ret._strings_md5 = obj['strings-md5']
284        ret._files = obj['input-files']
285        ret._strings = obj['input-strings']
286        return ret
287
288    def to_file(self, fileobj):
289        """Serializes metadata to the given file object."""
290        obj = {
291            "files-md5": self.files_md5(),
292            "strings-md5": self.strings_md5(),
293            "input-files": self._files,
294            "input-strings": self._strings,
295        }
296        json.dump(obj, fileobj, indent=2, sort_keys=True)
297
298    def add_strings(self, values):
299        self._assert_not_queried()
300        self._strings.extend(str(v) for v in values)
301
302    def add_file(self, path, tag):
303        """Adds metadata for a non-zip file.
304
305        Args:
306          path: Path to the file.
307          tag: A short string representative of the file contents.
308        """
309        self._assert_not_queried()
310        self._files.append({
311            'path': path,
312            'tag': tag,
313        })
314
315    def add_zip_file(self, path, entries):
316        """Adds metadata for a zip file.
317
318        Args:
319          path: Path to the file.
320          entries: List of (subpath, tag) tuples for entries within the zip.
321        """
322        self._assert_not_queried()
323        tag = _compute_inline_md5(
324            itertools.chain((e[0] for e in entries), (e[1] for e in entries)))
325        self._files.append({
326            'path':
327            path,
328            'tag':
329            tag,
330            'entries': [{
331                "path": e[0],
332                "tag": e[1]
333            } for e in entries],
334        })
335
336    def get_strings(self):
337        """Returns the list of input strings."""
338        return self._strings
339
340    def files_md5(self):
341        """Lazily computes and returns the aggregate md5 of input files."""
342        if self._files_md5 is None:
343            # Omit paths from md5 since temporary files have random names.
344            self._files_md5 = _compute_inline_md5(
345                self.get_tag(p) for p in sorted(self.iter_paths()))
346        return self._files_md5
347
348    def strings_md5(self):
349        """Lazily computes and returns the aggregate md5 of input strings."""
350        if self._strings_md5 is None:
351            self._strings_md5 = _compute_inline_md5(self._strings)
352        return self._strings_md5
353
354    def get_tag(self, path, subpath=None):
355        """Returns the tag for the given path / subpath."""
356        ret = self._get_entry(path, subpath)
357        return ret and ret['tag']
358
359    def iter_paths(self):
360        """Returns a generator for all top-level paths."""
361        return (e['path'] for e in self._files)
362
363    def iter_subpaths(self, path):
364        """Returns a generator for all subpaths in the given zip.
365
366        If the given path is not a zip file or doesn't exist, returns an empty
367        iterable.
368        """
369        outer_entry = self._get_entry(path)
370        if not outer_entry:
371            return ()
372        subentries = outer_entry.get('entries', [])
373        return (entry['path'] for entry in subentries)
374
375    def _assert_not_queried(self):
376        assert self._files_md5 is None
377        assert self._strings_md5 is None
378        assert self._file_map is None
379
380    def _get_entry(self, path, subpath=None):
381        """Returns the JSON entry for the given path / subpath."""
382        if self._file_map is None:
383            self._file_map = {}
384            for entry in self._files:
385                self._file_map[(entry['path'], None)] = entry
386                for subentry in entry.get('entries', ()):
387                    self._file_map[(entry['path'],
388                                    subentry['path'])] = subentry
389        return self._file_map.get((path, subpath))
390
391
392def _update_md5_for_file(md5, path, block_size=2**16):
393    # record md5 of linkto for dead link.
394    if os.path.islink(path):
395        linkto = os.readlink(path)
396        if not os.path.exists(linkto):
397            md5.update(linkto.encode())
398            return
399
400    with open(path, 'rb') as infile:
401        while True:
402            data = infile.read(block_size)
403            if not data:
404                break
405            md5.update(data)
406
407
def _update_md5_for_directory(md5, dir_path):
    """Feeds every file under |dir_path| (recursively) into |md5|."""
    for root, _, filenames in os.walk(dir_path):
        for full_path in (os.path.join(root, name) for name in filenames):
            _update_md5_for_file(md5, full_path)
412
413
def _md5_for_path(path):
    """Returns the hex md5 of a file, or of a directory's contents."""
    digest = hashlib.md5()
    if not os.path.isdir(path):
        _update_md5_for_file(digest, path)
    else:
        _update_md5_for_directory(digest, path)
    return digest.hexdigest()
421
422
423def _compute_inline_md5(iterable):
424    """Computes the md5 of the concatenated parameters."""
425    md5 = hashlib.md5()
426    for item in iterable:
427        md5.update(str(item).encode())
428    return md5.hexdigest()
429
430
431def _is_zip_file(path):
432    """Returns whether to treat the given file as a zip file."""
433    return path[-4:] in ('.zip')
434
435
436def _extract_zip_entries(path):
437    """Returns a list of (path, CRC32) of all files within |path|."""
438    entries = []
439    with zipfile.ZipFile(path) as zip_file:
440        for zip_info in zip_file.infolist():
441            # Skip directories and empty files.
442            if zip_info.CRC:
443                entries.append(
444                    (zip_info.filename, zip_info.CRC + zip_info.compress_type))
445    return entries
446