#!/usr/bin/env python
"""Lint a system font configuration against the font files it references.

Given a target output directory, this script parses ``etc/fonts.xml`` and
checks the fonts under ``fonts/``: compact/elegant fallback pairing, vertical
metrics of the default families, hyphen coverage for hyphenated scripts, CJK
punctuation placement, PostScript names and, optionally, emoji coverage
against Unicode data files.

Most checks report failures through ``assert``, so the script must not be run
with ``python -O``.
"""

import collections
import copy
import glob
from os import path
import re
import sys
from xml.etree import ElementTree

from fontTools import ttLib

EMOJI_VS = 0xFE0F

# Maps a lower-case language code to the script used to look up fonts for it.
# NOTE(review): several languages that are not written in Latin script (e.g.
# 'am', 'el', 'ka', 'ru', 'uk') are mapped to 'Latn' here; presumably this is
# deliberate for the hyphen check -- confirm before "correcting" any entry.
LANG_TO_SCRIPT = {
    'af': 'Latn',
    'as': 'Beng',
    'am': 'Latn',
    'be': 'Cyrl',
    'bg': 'Cyrl',
    'bn': 'Beng',
    'cs': 'Latn',
    'cu': 'Cyrl',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'el': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gl': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'it': 'Latn',
    'ja': 'Jpan',
    'ka': 'Latn',
    'kn': 'Knda',
    'ko': 'Kore',
    'la': 'Latn',
    'lt': 'Latn',
    'lv': 'Latn',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nl': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'pl': 'Latn',
    'ru': 'Latn',
    'sk': 'Latn',
    'sl': 'Latn',
    'sq': 'Latn',
    'sv': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
    'uk': 'Latn',
}


def lang_to_script(lang_code):
    """Return the script code for a hyphen-separated language tag.

    The tag is lower-cased and looked up in LANG_TO_SCRIPT.  While it is not
    found, the last hyphen-separated subtag is inspected: a four-letter
    alphabetic subtag is taken to be the script itself and returned in title
    case; any other subtag is dropped and the lookup is retried.

    Raises AssertionError when no subtags are left and the language is unknown.
    """
    lang = lang_code.lower()
    while lang not in LANG_TO_SCRIPT:
        hyphen_idx = lang.rfind('-')
        assert hyphen_idx != -1, (
            'We do not know what script the "%s" language is written in.'
            % lang_code)
        assumed_script = lang[hyphen_idx+1:]
        if len(assumed_script) == 4 and assumed_script.isalpha():
            # This is actually the script
            return assumed_script.title()
        lang = lang[:hyphen_idx]
    return LANG_TO_SCRIPT[lang]


def printable(inp):
    """Format a code point, a code point sequence or a set of them for messages.

    Sets become '{...}', tuples become '<...>' and single code points become
    'U+XXXX'.
    """
    if isinstance(inp, (set, frozenset)):  # set of character sequences
        return '{' + ', '.join(printable(seq) for seq in inp) + '}'
    if isinstance(inp, tuple):  # character sequence
        return '<' + ', '.join(printable(ch) for ch in inp) + '>'
    # single character
    return 'U+%04X' % inp


def open_font(font):
    """Open a font given as a (file name, collection index) pair.

    The file name is resolved relative to the global _fonts_dir.  When the
    index is not None it is passed as the font number within a collection.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is not None:
        return ttLib.TTFont(font_path, fontNumber=index)
    return ttLib.TTFont(font_path)


def get_best_cmap(font):
    """Return the code point -> glyph name mapping of the widest Unicode cmap.

    The format 12 (platform 3, encoding 10) subtable is preferred; the
    format 4 (platform 3, encoding 1) subtable is used otherwise.

    Raises AssertionError if either kind appears more than once or if neither
    is present.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    best = all_unicode_cmap if all_unicode_cmap else bmp_cmap
    # Fail with a readable message instead of an AttributeError on None.
    assert best is not None, 'No Unicode cmap found in %s' % (font, )
    return best.cmap


def get_variation_sequences_cmap(font):
    """Return the format 14 (platform 0, encoding 5) cmap subtable, or None.

    Raises AssertionError if the font has more than one such subtable.
    """
    ttfont = open_font(font)
    vs_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (14, 0, 5):
            assert vs_cmap is None, 'More than one VS cmap in %s' % (font, )
            vs_cmap = cmap
    return vs_cmap


def get_emoji_map(font):
    """Build a map from code points and code point sequences to glyph names.

    Keys are single code points (from the best cmap), (base, selector) pairs
    (from the variation sequence cmap) and tuples of code points (from GSUB
    ligature lookups).
    """
    # Add normal characters
    emoji_map = copy.copy(get_best_cmap(font))
    reverse_cmap = {glyph: code for code, glyph in emoji_map.items()
                    if not contains_pua(code)}

    # Add variation sequences
    vs_cmap = get_variation_sequences_cmap(font)
    if vs_cmap:
        for vs, mappings in vs_cmap.uvsDict.items():
            for base, glyph in mappings:
                if glyph is None:
                    # No dedicated glyph: the sequence renders as the base.
                    emoji_map[(base, vs)] = emoji_map[base]
                else:
                    emoji_map[(base, vs)] = glyph

    # Add GSUB rules
    ttfont = open_font(font)
    for lookup in ttfont['GSUB'].table.LookupList.Lookup:
        if lookup.LookupType != 4:
            # Other lookups are used in the emoji font for fallback.
            # We ignore them for now.
            continue
        for subtable in lookup.SubTable:
            ligatures = subtable.ligatures
            for first_glyph in ligatures:
                for ligature in ligatures[first_glyph]:
                    glyphs = [first_glyph] + ligature.Component
                    sequence = tuple(reverse_cmap[glyph] for glyph in glyphs)
                    # Make sure no starting subsequence of 'sequence' has been
                    # seen before.
                    for sub_len in range(2, len(sequence)+1):
                        subsequence = sequence[:sub_len]
                        assert subsequence not in emoji_map
                    emoji_map[sequence] = ligature.LigGlyph

    return emoji_map


def assert_font_supports_any_of_chars(font, chars):
    """Exit the process unless the font maps at least one of the characters."""
    best_cmap = get_best_cmap(font)
    for char in chars:
        if char in best_cmap:
            return
    sys.exit('None of characters in %s were found in %s' % (chars, font))


def assert_font_supports_all_of_chars(font, chars):
    """Assert that the font maps every one of the characters."""
    best_cmap = get_best_cmap(font)
    for char in chars:
        assert char in best_cmap, (
            'U+%04X was not found in %s' % (char, font))


def assert_font_supports_none_of_chars(font, chars, fallbackName):
    """Assert that the font maps none of the characters.

    When fallbackName is non-empty it is included in the failure message.
    """
    best_cmap = get_best_cmap(font)
    for char in chars:
        # The two branches used to be swapped, so the fallback name was only
        # formatted into the message when there was no name to show.
        if fallbackName:
            assert char not in best_cmap, (
                'U+%04X was found in %s in fallback %s' % (char, font, fallbackName))
        else:
            assert char not in best_cmap, 'U+%04X was found in %s' % (char, font)


def assert_font_supports_all_sequences(font, sequences):
    """Assert that every (base, selector) pair is a default variation sequence.

    A pair is accepted when the font's variation sequence cmap lists
    (base, None) under the selector.
    """
    vs_cmap = get_variation_sequences_cmap(font)
    # A font without a variation sequence cmap supports no sequences; treat it
    # as an empty table so that the assertion below reports what is missing.
    vs_dict = vs_cmap.uvsDict if vs_cmap else {}
    for base, vs in sorted(sequences):
        assert vs in vs_dict and (base, None) in vs_dict[vs], (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))


def check_hyphens(hyphens_dir):
    """Check that fonts for every hyphenated script contain a hyphen.

    The scripts are derived from the 'hyph-<lang>.hyb' files in hyphens_dir.
    """
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_file in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        hyb_file = path.basename(hyb_file)
        assert hyb_file.startswith('hyph-'), (
            'Unknown hyphenation file %s' % hyb_file)
        lang_code = hyb_file[hyb_file.index('-')+1:hyb_file.index('.')]
        scripts.add(lang_to_script(lang_code))

    HYPHENS = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, HYPHENS)


class FontRecord(object):
    """One <font> entry of fonts.xml together with its family attributes."""

    def __init__(self, name, psName, scripts, variant, weight, style, fallback_for, font):
        self.name = name                  # family 'name' attribute, or None
        self.psName = psName              # 'postScriptName' attribute, or None
        self.scripts = scripts            # frozenset of script codes
        self.variant = variant            # None, 'elegant' or 'compact'
        self.weight = weight              # integer weight, multiple of 100
        self.style = style                # 'normal' or 'italic'
        self.fallback_for = fallback_for  # 'fallbackFor' attribute, or None
        self.font = font                  # (file name, collection index or None)


def parse_fonts_xml(fonts_xml_path):
    """Parse fonts.xml and populate the module-level font tables.

    Sets the globals _script_to_font_map (script -> set of fonts),
    _fallback_chains (family name -> list of FontRecord) and _all_fonts.
    Fonts whose file does not exist under _fonts_dir are skipped.
    """
    global _script_to_font_map, _fallback_chains, _all_fonts
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chains = {}
    _all_fonts = []
    tree = ElementTree.parse(fonts_xml_path)
    families = tree.findall('family')
    # Minikin supports up to 254 but users can place their own font at the first
    # place. Thus, 253 is the maximum allowed number of font families in the
    # default collection.
    assert len(families) < 254, (
        'System font collection can contains up to 253 font families.')

    # First pass: validate family attributes and register every named family,
    # so that 'fallbackFor' can refer to a family declared later in the file.
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')

        if name:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)
            assert name not in _fallback_chains, 'Duplicated name entry %s' % name
            _fallback_chains[name] = []
        else:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)

    # Second pass: build the records and the fallback chains.
    trim_re = re.compile(r"^[ \n\r\t]*(.+)[ \n\r\t]*$")
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        ignoreAttr = family.get('ignore')
        ignore = ignoreAttr == 'true' or ignoreAttr == '1'

        if ignore:
            continue

        if langs:
            langs = langs.split()
            scripts = {lang_to_script(lang) for lang in langs}
        else:
            scripts = set()

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)

            m = trim_re.match((child.text or '').rstrip())
            assert m, 'Missing font file name in a <font> element'
            font_file = m.group(1)

            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            fallback_for = child.get('fallbackFor')

            assert not name or not fallback_for, (
                'name and fallbackFor cannot be present at the same time')
            assert not fallback_for or fallback_for in _fallback_chains, (
                'Unknown fallback name: %s' % fallback_for)

            # A missing or empty attribute means "not a collection member".
            index = child.get('index')
            index = int(index) if index else None

            if not path.exists(path.join(_fonts_dir, font_file)):
                continue  # Missing font is a valid case. Just ignore the missing font files.

            record = FontRecord(
                name,
                child.get('postScriptName'),
                frozenset(scripts),
                variant,
                weight,
                style,
                fallback_for,
                (font_file, index))

            _all_fonts.append(record)

            if fallback_for:
                _fallback_chains[fallback_for].append(record)
            elif not name or name == 'sans-serif':
                # Unnamed families and 'sans-serif' are part of every chain.
                for fallback in _fallback_chains.values():
                    fallback.append(record)
            else:
                _fallback_chains[name].append(record)

            if name:  # non-empty names are used for default LGC fonts
                map_scripts = {'Latn', 'Grek', 'Cyrl'}
            else:
                map_scripts = scripts
            for script in map_scripts:
                _script_to_font_map[script].add((font_file, index))


def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Run the emoji coverage check on every emoji font in fonts.xml."""
    emoji_fonts = get_emoji_fonts()
    check_emoji_font_coverage(emoji_fonts, all_emoji, equivalent_emoji)


def get_emoji_fonts():
    """Return the fonts of all records whose scripts include 'Zsye'."""
    return [record.font for record in _all_fonts if 'Zsye' in record.scripts]


def seq_any(sequence, pred):
    """Return whether pred holds for any element of a tuple, or for a scalar."""
    if isinstance(sequence, tuple):
        return any(pred(x) for x in sequence)
    return pred(sequence)


def seq_all(sequence, pred):
    """Return whether pred holds for all elements of a tuple, or for a scalar."""
    if isinstance(sequence, tuple):
        return all(pred(x) for x in sequence)
    return pred(sequence)


def is_regional_indicator(x):
    # regional indicator A..Z
    return 0x1F1E6 <= x <= 0x1F1FF


def is_tag(x):
    # tag block
    return 0xE0000 <= x <= 0xE007F


def is_pua(x):
    """Return whether x lies in one of the three private use ranges."""
    return 0xE000 <= x <= 0xF8FF or 0xF0000 <= x <= 0xFFFFD or 0x100000 <= x <= 0x10FFFD


def contains_pua(sequence):
    return seq_any(sequence, is_pua)


def contains_regional_indicator(sequence):
    return seq_any(sequence, is_regional_indicator)


def only_tags(sequence):
    return seq_all(sequence, is_tag)


def get_psname(ttf):
    """Return the first name record with nameID 6 on platform 3, encoding 1.

    Raises StopIteration if the font has no such record.
    """
    return str(next(x for x in ttf['name'].names
                    if x.platformID == 3 and x.platEncID == 1 and x.nameID == 6))


def hex_strs(sequence):
    """Return hexadecimal text for a code point or a tuple of code points.

    A tuple yields a tuple of upper-case strings without prefix; any other
    value is passed to the builtin hex(), which yields a lower-case string
    with a '0x' prefix.
    """
    if isinstance(sequence, tuple):
        return tuple(f"{s:X}" for s in sequence)
    return hex(sequence)


def check_emoji_not_compat(all_emoji, equivalent_emoji):
    """Assert that no emoji font carries an 'Emji' entry in its 'meta' table.

    Both parameters are unused; they are kept so that existing callers keep
    working.
    """
    for emoji_font in get_emoji_fonts():
        ttf = open_font(emoji_font)
        # Kept for its side effect: raises if the PostScript name is missing.
        get_psname(ttf)

        if "meta" in ttf:
            assert 'Emji' not in ttf["meta"].data, 'NotoColorEmoji MUST be a compat font'


def check_emoji_font_coverage(emoji_fonts, all_emoji, equivalent_emoji):
    """Check the emoji fonts against the expected emoji set.

    Collects an error for every expected emoji that no font supports, every
    unexpected non-PUA entry a font supports, every pair of equivalent emoji
    that map to different glyphs, and every glyph shared by sequences that
    are not declared equivalent.  Raises AssertionError listing all errors.
    """
    coverages = [get_emoji_map(emoji_font) for emoji_font in emoji_fonts]

    errors = []

    for sequence in all_emoji:
        if not any(sequence in coverage for coverage in coverages):
            errors.append('%s is not supported in the emoji font.' % printable(sequence))

    for coverage in coverages:
        for sequence in coverage:
            if sequence in {0x0000, 0x000D, 0x0020}:
                # The font needs to support a few extra characters, which is OK
                continue

            if contains_pua(sequence):
                # The font needs to have some PUA for EmojiCompat library.
                continue

            if sequence not in all_emoji:
                errors.append('%s support unexpected in the emoji font.' % printable(sequence))

    for first, second in equivalent_emoji.items():
        for coverage in coverages:
            if first not in coverage or second not in coverage:
                continue  # sequence will be reported missing
            if coverage[first] != coverage[second]:
                errors.append('%s and %s should map to the same glyph.' % (
                    printable(first),
                    printable(second)))

    for coverage in coverages:
        # Group the non-PUA sequences by glyph in a single pass rather than
        # rescanning the whole coverage map once per glyph.
        seqs_by_glyph = collections.defaultdict(list)
        for seq, glyph in coverage.items():
            if not contains_pua(seq):
                seqs_by_glyph[glyph].append(seq)

        for glyph, maps_to_glyph in seqs_by_glyph.items():
            if len(maps_to_glyph) <= 1:
                continue
            # There are more than one sequences mapping to the same glyph. We
            # need to make sure they were expected to be equivalent.
            equivalent_seqs = set()
            for seq in maps_to_glyph:
                equivalent_seq = seq
                while equivalent_seq in equivalent_emoji:
                    equivalent_seq = equivalent_emoji[equivalent_seq]
                equivalent_seqs.add(equivalent_seq)
            if len(equivalent_seqs) != 1:
                errors.append('The sequences %s should not result in the same glyph %s' % (
                    printable(equivalent_seqs),
                    glyph))

    assert not errors, '%d emoji font errors:\n%s\n%d emoji font coverage errors' % (
        len(errors), '\n'.join(errors), len(errors))


def check_emoji_defaults(default_emoji):
    """Check which fonts in each fallback chain cover emoji characters.

    Fonts other than the emoji font must not map any default-emoji character,
    and the characters in the 'Emoji' property that are not default-emoji must
    be mapped by some font placed before the emoji font (Unicode 7.0
    characters are exempt).
    """
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    for name, fallback_chain in _fallback_chains.items():
        emoji_font_seen = False
        for record in fallback_chain:
            if 'Zsye' in record.scripts:
                emoji_font_seen = True
                # No need to check the emoji font
                continue
            # For later fonts, we only check them if they have a script
            # defined, since the defined script may get them to a higher
            # score even if they appear after the emoji font. However,
            # we should skip checking the text symbols font, since
            # symbol fonts should be able to override the emoji display
            # style when 'Zsym' is explicitly specified by the user.
            if emoji_font_seen and (not record.scripts or 'Zsym' in record.scripts):
                continue

            # Check default emoji-style characters
            assert_font_supports_none_of_chars(record.font, default_emoji, name)

            # Mark default text-style characters appearing in fonts above the emoji
            # font as seen
            if not emoji_font_seen:
                missing_text_chars -= set(get_best_cmap(record.font))

        # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
        # webdings yet.
        # NOTE(review): this check is placed inside the loop over chains, with
        # the set shared between chains -- confirm the intended nesting.
        missing_text_chars -= _chars_by_age['7.0']
        assert missing_text_chars == set(), (
            'Text style version of some emoji characters are missing: ' +
            repr(missing_text_chars))


# Setting reverse to true returns a dictionary that maps the values to sets of
# characters, useful for some binary properties. Otherwise, we get a
# dictionary that maps characters to the property values, assuming there's only
# one property in the file.
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a semicolon-separated Unicode data file.

    Each data line holds a single code point, a 'start..end' range or a
    space-separated sequence, followed by a property value.  Text after '#'
    is ignored.  Sequences are stored as tuples of integers.
    """
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    # The data files are UTF-8; do not depend on the locale's encoding.
    with open(file_path, encoding='utf-8') as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            if ' ' in chars:  # character sequence
                sequence = [int(ch, 16) for ch in chars.split(' ')]
                additions = [tuple(sequence)]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                char_start = int(char_start, 16)
                char_end = int(char_end, 16)
                additions = range(char_start, char_end+1)
            else:  # single character
                additions = [int(chars, 16)]
            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict
                    output_dict[addition] = prop
    return output_dict


def parse_emoji_variants(file_path):
    """Parse a variation sequence file into (text_set, emoji_set).

    Each set holds (base, selector) pairs taken from the lines described as
    'text style' and 'emoji style' respectively.
    """
    emoji_set = set()
    text_set = set()
    # The data files are UTF-8; do not depend on the locale's encoding.
    with open(file_path, encoding='utf-8') as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue
            sequence, description, _ = line.split(';')
            sequence = sequence.strip().split(' ')
            base = int(sequence[0], 16)
            vs = int(sequence[1], 16)
            description = description.strip()
            if description == 'text style':
                text_set.add((base, vs))
            elif description == 'emoji style':
                emoji_set.add((base, vs))
    return text_set, emoji_set


def parse_ucd(ucd_path):
    """Load the Unicode data files under ucd_path into module-level globals.

    Files in the 'additions' subdirectory extend the standard ones, and the
    entries of 'additions/emoji-exclusions.txt' are removed afterwards.
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences
    _emoji_properties = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-data.txt'), reverse=True)
    emoji_properties_additions = parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-data.txt'), reverse=True)
    for prop, chars in emoji_properties_additions.items():
        _emoji_properties[prop].update(chars)

    _chars_by_age = parse_unicode_datafile(
        path.join(ucd_path, 'DerivedAge.txt'), reverse=True)
    sequences = parse_emoji_variants(
        path.join(ucd_path, 'emoji-variation-sequences.txt'))
    _text_variation_sequences, _emoji_variation_sequences = sequences
    _emoji_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-sequences.txt'))
    _emoji_sequences.update(parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-sequences.txt')))
    _emoji_zwj_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-zwj-sequences.txt'))
    _emoji_zwj_sequences.update(parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-zwj-sequences.txt')))

    exclusions = parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-exclusions.txt'))
    _emoji_sequences = remove_emoji_exclude(_emoji_sequences, exclusions)
    _emoji_zwj_sequences = remove_emoji_exclude(_emoji_zwj_sequences, exclusions)
    _emoji_variation_sequences = remove_emoji_variation_exclude(
        _emoji_variation_sequences, exclusions)
    # Unicode 12.0 adds Basic_Emoji in emoji-sequences.txt. We ignore them here since we are already
    # checking the emoji presentations with emoji-variation-sequences.txt.
    # Please refer to http://unicode.org/reports/tr51/#def_basic_emoji_set .
    _emoji_sequences = {k: v for k, v in _emoji_sequences.items() if not v == 'Basic_Emoji'}


def remove_emoji_variation_exclude(source, items):
    """Return the set source without the keys of items."""
    return source.difference(items.keys())


def remove_emoji_exclude(source, items):
    """Return a copy of the dict source without the keys found in items."""
    return {k: v for k, v in source.items() if k not in items}


def flag_sequence(territory_code):
    """Return the regional indicator sequence for an upper-case A-Z code."""
    return tuple(0x1F1E6 + ord(ch) - ord('A') for ch in territory_code)


EQUIVALENT_FLAGS = {
    flag_sequence('BV'): flag_sequence('NO'),
    flag_sequence('CP'): flag_sequence('FR'),
    flag_sequence('HM'): flag_sequence('AU'),
    flag_sequence('SJ'): flag_sequence('NO'),
    flag_sequence('UM'): flag_sequence('US'),
}

COMBINING_KEYCAP = 0x20E3

LEGACY_ANDROID_EMOJI = {
    0xFE4E5: flag_sequence('JP'),
    0xFE4E6: flag_sequence('US'),
    0xFE4E7: flag_sequence('FR'),
    0xFE4E8: flag_sequence('DE'),
    0xFE4E9: flag_sequence('IT'),
    0xFE4EA: flag_sequence('GB'),
    0xFE4EB: flag_sequence('ES'),
    0xFE4EC: flag_sequence('RU'),
    0xFE4ED: flag_sequence('CN'),
    0xFE4EE: flag_sequence('KR'),
    0xFE82C: (ord('#'), COMBINING_KEYCAP),
    0xFE82E: (ord('1'), COMBINING_KEYCAP),
    0xFE82F: (ord('2'), COMBINING_KEYCAP),
    0xFE830: (ord('3'), COMBINING_KEYCAP),
    0xFE831: (ord('4'), COMBINING_KEYCAP),
    0xFE832: (ord('5'), COMBINING_KEYCAP),
    0xFE833: (ord('6'), COMBINING_KEYCAP),
    0xFE834: (ord('7'), COMBINING_KEYCAP),
    0xFE835: (ord('8'), COMBINING_KEYCAP),
    0xFE836: (ord('9'), COMBINING_KEYCAP),
    0xFE837: (ord('0'), COMBINING_KEYCAP),
}

# This is used to define the emoji that should have the same glyph.
# i.e. previously we had gender based Kiss (0x1F48F), which had the same glyph
# with Kiss: Woman, Man (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468)
# in that case a valid row would be:
# (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
ZWJ_IDENTICALS = {
}

SAME_FLAG_MAPPINGS = [
    # Diego Garcia and British Indian Ocean Territory
    ((0x1F1EE, 0x1F1F4), (0x1F1E9, 0x1F1EC)),
    # St. Martin and France
    ((0x1F1F2, 0x1F1EB), (0x1F1EB, 0x1F1F7)),
    # Spain and Ceuta & Melilla
    ((0x1F1EA, 0x1F1F8), (0x1F1EA, 0x1F1E6)),
]

ZWJ = 0x200D

EMPTY_FLAG_SEQUENCE = (0x1F3F4, 0xE007F)


def is_fitzpatrick_modifier(cp):
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Return seq reversed, keeping each modifier after the emoji it modifies.

    For example (A, F, ZWJ, B, G), with modifiers F and G, becomes
    (B, G, ZWJ, A, F).
    """
    rev = list(reversed(seq))
    # if there are fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify
    i = 1
    while i < len(rev):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
            # The modifier now sits at index i. Skip it, otherwise the next
            # step would see it again and push it further down the sequence.
            i += 2
        else:
            i += 1
    return tuple(rev)


def compute_expected_emoji():
    """Derive the expected emoji sets from the parsed Unicode data.

    Returns (all_emoji, default_emoji, equivalent_emoji): everything the emoji
    font should support, everything that defaults to emoji presentation, and
    a dict mapping each sequence to the one it must share a glyph with.

    Side effect: adds EMPTY_FLAG_SEQUENCE to the global _emoji_sequences.
    """
    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set()
    all_sequences.update(_emoji_variation_sequences)

    # Add empty flag tag sequence that is supported as fallback
    _emoji_sequences[EMPTY_FLAG_SEQUENCE] = 'Emoji_Tag_Sequence'

    # Sequences are recorded without the emoji variation selector.
    for sequences in (_emoji_sequences, _emoji_zwj_sequences):
        for sequence in sequences:
            sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
            all_sequences.add(sequence)
            sequence_pieces.update(sequence)

    for first, second in SAME_FLAG_MAPPINGS:
        equivalent_emoji[first] = second

    # Add all tag characters used in flags
    sequence_pieces.update(range(0xE0030, 0xE0039 + 1))
    sequence_pieces.update(range(0xE0061, 0xE007A + 1))

    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        set(LEGACY_ANDROID_EMOJI.keys()))
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        all_sequences |
        set(LEGACY_ANDROID_EMOJI.keys()))

    equivalent_emoji.update(EQUIVALENT_FLAGS)
    equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
    equivalent_emoji.update(ZWJ_IDENTICALS)

    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji


def check_compact_only_fallback():
    """Assert that every compact font has an elegant font for the same scripts."""
    for name, fallback_chain in _fallback_chains.items():
        for record in fallback_chain:
            if record.variant == 'compact':
                same_script_elegants = [
                    x for x in fallback_chain
                    if x.scripts == record.scripts and x.variant == 'elegant']
                # A stray trailing comma used to turn this message into a
                # 1-tuple; it is a plain string now.
                assert same_script_elegants, (
                    '%s must be in elegant of %s as fallback of "%s" too' % (
                        record.font, record.scripts, record.fallback_for))


def check_vertical_metrics():
    """Assert the expected vertical metrics of the default named families."""
    for record in _all_fonts:
        if record.name in ['sans-serif', 'sans-serif-condensed']:
            font = open_font(record.font)
            assert font['head'].yMax == 2163 and font['head'].yMin == -555, (
                'yMax and yMin of %s do not match expected values.' % (
                    record.font,))

        if record.name in ['sans-serif', 'sans-serif-condensed',
                           'serif', 'monospace']:
            font = open_font(record.font)
            assert (font['hhea'].ascent == 1900 and
                    font['hhea'].descent == -500), (
                        'ascent and descent of %s do not match expected '
                        'values.' % (record.font,))


def check_cjk_punctuation():
    """Assert that no font placed before a CJK font maps U+3000..U+301F."""
    cjk_scripts = {'Hans', 'Hant', 'Jpan', 'Kore'}
    cjk_punctuation = range(0x3000, 0x301F + 1)
    for name, fallback_chain in _fallback_chains.items():
        for record in fallback_chain:
            if record.scripts.intersection(cjk_scripts):
                # CJK font seen. Stop checking the rest of the fonts.
                break
            assert_font_supports_none_of_chars(record.font, cjk_punctuation, name)


def getPostScriptName(font):
    """Return the en-US (langID 0x0409) PostScript name of a font, or None.

    font is a (file name, collection index) pair.  For a collection member
    the name is always read from the first font of the collection.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is not None:
        # Use the first font file in the collection for resolving post script name.
        ttf = ttLib.TTFont(font_path, fontNumber=0)
    else:
        ttf = ttLib.TTFont(font_path)

    nameTable = ttf['name']
    for name in nameTable.names:
        if (name.nameID == 6 and name.platformID == 3 and name.platEncID == 1
                and name.langID == 0x0409):
            return str(name)
    return None


def check_canonical_name():
    """Assert that each font's PostScript name matches fonts.xml or its file name."""
    for record in _all_fonts:
        file_name, index = record.font

        psName = getPostScriptName(record.font)
        if record.psName:
            # If fonts element has postScriptName attribute, it should match with the PostScript
            # name in the name table.
            assert psName == record.psName, ('postScriptName attribute %s should match with %s' % (
                record.psName, psName))
        else:
            # If fonts element doesn't have postScriptName attribute, the file name should match
            # with the PostScript name in the name table.
            # splitext drops the extension whatever its length.
            assert psName == path.splitext(file_name)[0], ('file name %s should match with %s' % (
                file_name, psName))


def main():
    """Run all checks.

    Arguments: the target output directory, 'true' to enable the emoji checks
    (any other value disables them) and, when enabled, the Unicode data
    directory.
    """
    global _fonts_dir
    usage = 'Usage: %s <target_out> <check_emoji> [<ucd_path>]' % sys.argv[0]
    if len(sys.argv) < 3:
        sys.exit(usage)

    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    fonts_xml_path = path.join(target_out, 'etc', 'fonts.xml')

    parse_fonts_xml(fonts_xml_path)

    check_compact_only_fallback()

    check_vertical_metrics()

    hyphens_dir = path.join(target_out, 'usr', 'hyphen-data')
    check_hyphens(hyphens_dir)

    check_cjk_punctuation()

    check_canonical_name()

    check_emoji = sys.argv[2]
    if check_emoji == 'true':
        if len(sys.argv) < 4:
            sys.exit(usage)
        ucd_path = sys.argv[3]
        parse_ucd(ucd_path)
        all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
        check_emoji_not_compat(all_emoji, equivalent_emoji)
        check_emoji_coverage(all_emoji, equivalent_emoji)
        check_emoji_defaults(default_emoji)


if __name__ == '__main__':
    main()