#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import difflib
import hashlib
import itertools
import json
import os
import zipfile

from .pycache import pycache_enabled
from .pycache import pycache

# When set and a difference is detected, a diff of what changed is printed.
PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))

# An escape hatch that causes all targets to be rebuilt.
_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))


def get_new_metadata(input_strings, input_paths):
    """Builds a _Metadata describing the current inputs.

    Args:
        input_strings: Iterable of strings to record verbatim.
        input_paths: Iterable of file/directory paths to checksum. Paths
            ending in '.zip' are recorded per-entry so sub-file changes
            can be reported individually.

    Returns:
        A populated _Metadata instance.
    """
    new_metadata = _Metadata()
    new_metadata.add_strings(input_strings)

    for path in input_paths:
        if _is_zip_file(path):
            entries = _extract_zip_entries(path)
            new_metadata.add_zip_file(path, entries)
        else:
            new_metadata.add_file(path, _md5_for_path(path))
    return new_metadata


def get_old_metadata(record_path):
    """Loads the previously recorded _Metadata from |record_path|.

    Returns:
        A _Metadata instance, or None when the record is missing or
        unparsable (callers treat None as "everything is stale").
    """
    old_metadata = None
    if os.path.exists(record_path):
        with open(record_path, 'r') as jsonfile:
            try:
                old_metadata = _Metadata.from_file(jsonfile)
            except Exception:  # pylint: disable=broad-except
                # A corrupt or old-format record is treated as absent.
                pass
    return old_metadata


def print_explanations(record_path, changes):
    """Prints what made the target stale when PRINT_BUILD_EXPLANATIONS=1."""
    if PRINT_EXPLANATIONS:
        print('=' * 80)
        print('Target is stale: %s' % record_path)
        print(changes.describe_difference())
        print('=' * 80)


def call_and_record_if_stale(
        function,  # pylint: disable=invalid-name
        record_path=None,
        input_paths=None,
        input_strings=None,
        output_paths=None,
        force=False,
        pass_changes=False):
    """Calls function if outputs are stale.

    Outputs are considered stale if:
    - any output_paths are missing, or
    - the contents of any file within input_paths has changed, or
    - the contents of input_strings has changed.

    To debug which files are out-of-date, set the environment variable:
      PRINT_BUILD_EXPLANATIONS=1

    Args:
      function: The function to call.
      record_path: Path to record metadata.
        Defaults to output_paths[0] + '.md5.stamp'
      input_paths: List of paths to calculate a md5 sum on.
      input_strings: List of strings to record verbatim.
      output_paths: List of output paths.
      force: Whether to treat outputs as missing regardless of whether they
        actually are.
      pass_changes: Whether to pass a Changes instance to |function|.
    """
    assert record_path or output_paths
    input_paths = input_paths or []
    input_strings = input_strings or []
    output_paths = output_paths or []

    new_metadata = get_new_metadata(input_strings, input_paths)
    force = force or _FORCE_REBUILD
    missing_outputs = [
        x for x in output_paths if force or not os.path.exists(x)
    ]

    if pycache_enabled:
        # Input strings, input files and outputs names together compose
        # cache manifest, which is the only identifier of a python action.
        manifest = '-'.join(
            [new_metadata.strings_md5(),
             new_metadata.files_md5()] + sorted(output_paths))
        record_path = pycache.get_manifest_path('{}.manifest'.format(manifest))
        old_metadata = get_old_metadata(record_path)
    else:
        record_path = record_path or output_paths[0] + '.md5.stamp'
        # When outputs are missing, don't bother gathering change information.
        if not missing_outputs:
            old_metadata = get_old_metadata(record_path)
        else:
            old_metadata = None

    changes = Changes(old_metadata, new_metadata, force, missing_outputs)
    if not changes.has_changes():
        if not pycache_enabled:
            return
        # With the cache enabled, "no changes" still requires the outputs to
        # be restorable from the cache; otherwise fall through and rebuild.
        if pycache_enabled and pycache.retrieve(output_paths, prefix=manifest):
            return

    print_explanations(record_path, changes)

    args = (changes, ) if pass_changes else ()
    function(*args)
    if pycache_enabled:
        try:
            pycache.report_cache_stat('cache_miss')
        except Exception:  # pylint: disable=broad-except
            # Cache statistics are best-effort only.
            pass
        pycache.save(output_paths, prefix=manifest)

    # Record the new metadata only after |function| has succeeded, so a
    # failed build stays stale on the next run.
    with open(record_path, 'w') as record:
        new_metadata.to_file(record)


class Changes(object):
    """Provides an API for querying what changed between runs."""
    def __init__(self, old_metadata, new_metadata, force, missing_outputs):
        self.old_metadata = old_metadata
        self.new_metadata = new_metadata
        self.force = force
        self.missing_outputs = missing_outputs

    def has_changes(self):
        """Returns whether any changes exist."""
        return (
            self.force or not self.old_metadata or
            self.old_metadata.strings_md5() != self.new_metadata.strings_md5()
            or self.old_metadata.files_md5() != self.new_metadata.files_md5())

    def added_or_modified_only(self):
        """Returns whether the only changes were from added or modified (sub)files.

        No missing outputs, no removed paths/subpaths.
        """
        if (self.force or not self.old_metadata
                or self.old_metadata.strings_md5() !=
                self.new_metadata.strings_md5()):
            return False
        if any(self.iter_removed_paths()):
            return False
        for path in self.iter_modified_paths():
            if any(self.iter_removed_subpaths(path)):
                return False
        return True

    def iter_all_paths(self):
        """Generator for paths."""
        return self.new_metadata.iter_paths()

    def iter_all_subpaths(self, path):
        """Generator for subpaths."""
        return self.new_metadata.iter_subpaths(path)

    def iter_added_paths(self):
        """Generator for paths that were added."""
        for path in self.new_metadata.iter_paths():
            if self._get_old_tag(path) is None:
                yield path

    def iter_added_subpaths(self, path):
        """Generator for paths that were added within the given zip file."""
        for subpath in self.new_metadata.iter_subpaths(path):
            if self._get_old_tag(path, subpath) is None:
                yield subpath

    def iter_removed_paths(self):
        """Generator for paths that were removed."""
        if self.old_metadata:
            for path in self.old_metadata.iter_paths():
                if self.new_metadata.get_tag(path) is None:
                    yield path

    def iter_removed_subpaths(self, path):
        """Generator for paths that were removed within the given zip file."""
        if self.old_metadata:
            for subpath in self.old_metadata.iter_subpaths(path):
                if self.new_metadata.get_tag(path, subpath) is None:
                    yield subpath

    def iter_modified_paths(self):
        """Generator for paths whose contents have changed."""
        for path in self.new_metadata.iter_paths():
            old_tag = self._get_old_tag(path)
            new_tag = self.new_metadata.get_tag(path)
            if old_tag is not None and old_tag != new_tag:
                yield path

    def iter_modified_subpaths(self, path):
        """Generator for paths within a zip file whose contents have changed."""
        for subpath in self.new_metadata.iter_subpaths(path):
            old_tag = self._get_old_tag(path, subpath)
            new_tag = self.new_metadata.get_tag(path, subpath)
            if old_tag is not None and old_tag != new_tag:
                yield subpath

    def iter_changed_paths(self):
        """Generator for all changed paths (added/removed/modified)."""
        return itertools.chain(self.iter_removed_paths(),
                               self.iter_modified_paths(),
                               self.iter_added_paths())

    def iter_changed_subpaths(self, path):
        """Generator for paths within a zip that were added/removed/modified."""
        return itertools.chain(self.iter_removed_subpaths(path),
                               self.iter_modified_subpaths(path),
                               self.iter_added_subpaths(path))

    def describe_difference(self):
        """Returns a human-readable description of what changed."""
        if self.force:
            return 'force=True'
        elif self.old_metadata is None:
            return 'Previous stamp file not found.'

        if self.old_metadata.strings_md5() != self.new_metadata.strings_md5():
            ndiff = difflib.ndiff(self.old_metadata.get_strings(),
                                  self.new_metadata.get_strings())
            changed = [s for s in ndiff if not s.startswith(' ')]
            return 'Input strings changed:\n ' + '\n '.join(changed)

        if self.old_metadata.files_md5() == self.new_metadata.files_md5():
            return "There's no difference."

        # Note: .format(p) must be applied per element inside the generator;
        # formatting the generator object itself would stringify it once and
        # then extend() would iterate that string character-by-character.
        lines = []
        lines.extend('Added: {}'.format(p) for p in self.iter_added_paths())
        lines.extend('Removed: {}'.format(p)
                     for p in self.iter_removed_paths())
        for path in self.iter_modified_paths():
            lines.append('Modified: {}'.format(path))
            lines.extend(' -> Subpath added: {}'.format(p)
                         for p in self.iter_added_subpaths(path))
            lines.extend(' -> Subpath removed: {}'.format(p)
                         for p in self.iter_removed_subpaths(path))
            lines.extend(' -> Subpath modified: {}'.format(p)
                         for p in self.iter_modified_subpaths(path))
        if lines:
            return 'Input files changed:\n {}'.format('\n '.join(lines))

        if self.missing_outputs:
            return 'Outputs do not exist:\n {}'.format('\n '.join(
                self.missing_outputs))

        return 'I have no idea what changed (there is a bug).'

    def _get_old_tag(self, path, subpath=None):
        return self.old_metadata and self.old_metadata.get_tag(path, subpath)


class _Metadata(object):
    """Data model for tracking change metadata."""
    def __init__(self):
        # Aggregate md5s, lazily computed on first query.
        self._files_md5 = None
        self._strings_md5 = None
        self._files = []
        self._strings = []
        # Map of (path, subpath) -> entry. Created upon first call to
        # _get_entry().
        self._file_map = None

    @classmethod
    def from_file(cls, fileobj):
        """Returns a _Metadata initialized from a file object."""
        ret = cls()
        obj = json.load(fileobj)
        ret._files_md5 = obj['files-md5']
        ret._strings_md5 = obj['strings-md5']
        ret._files = obj['input-files']
        ret._strings = obj['input-strings']
        return ret

    def to_file(self, fileobj):
        """Serializes metadata to the given file object."""
        obj = {
            "files-md5": self.files_md5(),
            "strings-md5": self.strings_md5(),
            "input-files": self._files,
            "input-strings": self._strings,
        }
        json.dump(obj, fileobj, indent=2, sort_keys=True)

    def add_strings(self, values):
        self._assert_not_queried()
        self._strings.extend(str(v) for v in values)

    def add_file(self, path, tag):
        """Adds metadata for a non-zip file.

        Args:
          path: Path to the file.
          tag: A short string representative of the file contents.
        """
        self._assert_not_queried()
        self._files.append({
            'path': path,
            'tag': tag,
        })

    def add_zip_file(self, path, entries):
        """Adds metadata for a zip file.

        Args:
          path: Path to the file.
          entries: List of (subpath, tag) tuples for entries within the zip.
        """
        self._assert_not_queried()
        tag = _compute_inline_md5(
            itertools.chain((e[0] for e in entries), (e[1] for e in entries)))
        self._files.append({
            'path': path,
            'tag': tag,
            'entries': [{
                "path": e[0],
                "tag": e[1]
            } for e in entries],
        })

    def get_strings(self):
        """Returns the list of input strings."""
        return self._strings

    def files_md5(self):
        """Lazily computes and returns the aggregate md5 of input files."""
        if self._files_md5 is None:
            # Omit paths from md5 since temporary files have random names.
            self._files_md5 = _compute_inline_md5(
                self.get_tag(p) for p in sorted(self.iter_paths()))
        return self._files_md5

    def strings_md5(self):
        """Lazily computes and returns the aggregate md5 of input strings."""
        if self._strings_md5 is None:
            self._strings_md5 = _compute_inline_md5(self._strings)
        return self._strings_md5

    def get_tag(self, path, subpath=None):
        """Returns the tag for the given path / subpath."""
        ret = self._get_entry(path, subpath)
        return ret and ret['tag']

    def iter_paths(self):
        """Returns a generator for all top-level paths."""
        return (e['path'] for e in self._files)

    def iter_subpaths(self, path):
        """Returns a generator for all subpaths in the given zip.

        If the given path is not a zip file or doesn't exist, returns an empty
        iterable.
        """
        outer_entry = self._get_entry(path)
        if not outer_entry:
            return ()
        subentries = outer_entry.get('entries', [])
        return (entry['path'] for entry in subentries)

    def _assert_not_queried(self):
        # Mutating after an md5 has been computed would silently desync the
        # cached digests from the underlying data.
        assert self._files_md5 is None
        assert self._strings_md5 is None
        assert self._file_map is None

    def _get_entry(self, path, subpath=None):
        """Returns the JSON entry for the given path / subpath."""
        if self._file_map is None:
            self._file_map = {}
            for entry in self._files:
                self._file_map[(entry['path'], None)] = entry
                for subentry in entry.get('entries', ()):
                    self._file_map[(entry['path'],
                                    subentry['path'])] = subentry
        return self._file_map.get((path, subpath))


def _update_md5_for_file(md5, path, block_size=2**16):
    # record md5 of linkto for dead link.
    if os.path.islink(path):
        linkto = os.readlink(path)
        if not os.path.exists(linkto):
            md5.update(linkto.encode())
            return

    with open(path, 'rb') as infile:
        while True:
            data = infile.read(block_size)
            if not data:
                break
            md5.update(data)


def _update_md5_for_directory(md5, dir_path):
    for root, _, files in os.walk(dir_path):
        for f in files:
            _update_md5_for_file(md5, os.path.join(root, f))


def _md5_for_path(path):
    md5 = hashlib.md5()
    if os.path.isdir(path):
        _update_md5_for_directory(md5, path)
    else:
        _update_md5_for_file(md5, path)
    return md5.hexdigest()


def _compute_inline_md5(iterable):
    """Computes the md5 of the concatenated parameters."""
    md5 = hashlib.md5()
    for item in iterable:
        md5.update(str(item).encode())
    return md5.hexdigest()


def _is_zip_file(path):
    """Returns whether to treat the given file as a zip file."""
    # Note: the original `path[-4:] in ('.zip')` was a substring test
    # (missing tuple comma) that falsely matched paths shorter than 4 chars.
    return path.endswith('.zip')


def _extract_zip_entries(path):
    """Returns a list of (path, CRC32) of all files within |path|."""
    entries = []
    with zipfile.ZipFile(path) as zip_file:
        for zip_info in zip_file.infolist():
            # Skip directories and empty files.
            if zip_info.CRC:
                entries.append(
                    (zip_info.filename, zip_info.CRC + zip_info.compress_type))
    return entries