1#!/usr/bin/env python
2
3import collections
4import copy
5import glob
6from os import path
7import re
8import sys
9from xml.etree import ElementTree
10
11from fontTools import ttLib
12
# U+FE0F (VARIATION SELECTOR-16). compute_expected_emoji() strips this code
# point out of emoji sequences before recording them.
EMOJI_VS = 0xFE0F

# Maps a lower-case language code to a four-letter script code. It is consulted
# by lang_to_script(), whose results are used as keys of the script-to-font map.
# NOTE(review): several languages that are not written in the Latin script
# ('am', 'el', 'ka', 'ru', 'uk') are mapped to 'Latn' here. This looks
# deliberate (presumably so that their checks run against the default fonts),
# but confirm before "correcting" any of these entries.
LANG_TO_SCRIPT = {
    'af': 'Latn',
    'as': 'Beng',
    'am': 'Latn',
    'be': 'Cyrl',
    'bg': 'Cyrl',
    'bn': 'Beng',
    'cs': 'Latn',
    'cu': 'Cyrl',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'el': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gl': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'it': 'Latn',
    'ja': 'Jpan',
    'ka': 'Latn',
    'kn': 'Knda',
    'ko': 'Kore',
    'la': 'Latn',
    'lt': 'Latn',
    'lv': 'Latn',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nl': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'pl': 'Latn',
    'ru': 'Latn',
    'sk': 'Latn',
    'sl': 'Latn',
    'sq': 'Latn',
    'sv': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
    'uk': 'Latn',
}
68
def lang_to_script(lang_code):
    """Return the script code for a hyphen-separated language tag.

    The lower-cased tag is looked up in LANG_TO_SCRIPT. While it is unknown,
    the last hyphen-separated subtag is examined: a four-letter alphabetic
    subtag is taken to be the script itself and returned title-cased,
    otherwise the subtag is dropped and the lookup is retried.

    Fails an assertion when the tag runs out of subtags without a match.
    """
    candidate = lang_code.lower()
    while True:
        if candidate in LANG_TO_SCRIPT:
            return LANG_TO_SCRIPT[candidate]
        split_at = candidate.rfind('-')
        assert split_at != -1, (
            'We do not know what script the "%s" language is written in.'
            % lang_code)
        trailing_subtag = candidate[split_at+1:]
        if trailing_subtag.isalpha() and len(trailing_subtag) == 4:
            # This is actually the script
            return trailing_subtag.title()
        candidate = candidate[:split_at]
82
83
def printable(inp):
    """Format a code point, a sequence of them, or a set of sequences.

    A set is rendered as '{a, b}', a tuple as '<a, b>' and anything else as a
    single 'U+XXXX' code point. Elements are formatted recursively.
    """
    kind = type(inp)
    if kind is set:  # set of character sequences
        return '{%s}' % ', '.join(printable(seq) for seq in inp)
    if kind is tuple:  # character sequence
        return '<%s>' % ', '.join(printable(ch) for ch in inp)
    # single character
    return 'U+%04X' % inp
91
92
def open_font(font):
    """Open a font given as a (file name, collection index) pair.

    The file name is resolved against the module-level _fonts_dir. When the
    index is not None it is passed to TTFont as fontNumber.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is None:
        return ttLib.TTFont(font_path)
    return ttLib.TTFont(font_path, fontNumber=index)
100
101
def get_best_cmap(font):
    """Return the code point -> glyph name mapping of the font's best cmap.

    Only two subtables are considered, identified by (format, platformID,
    platEncID): (12, 3, 10) is preferred and (4, 3, 1) is the fallback.

    Fails an assertion when either kind appears more than once, or when the
    font has neither of them.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    best_cmap = all_unicode_cmap if all_unicode_cmap is not None else bmp_cmap
    # Without this check a font lacking both subtables would fail with an
    # unhelpful AttributeError on None.
    assert best_cmap is not None, (
        'Neither a UCS-4 nor a BMP cmap was found in %s' % (font, ))
    return best_cmap.cmap
117
118
def get_variation_sequences_cmap(font):
    """Return the font's variation-sequence cmap subtable, or None.

    The subtable is the one whose (format, platformID, platEncID) is
    (14, 0, 5). Fails an assertion if the font has more than one.
    """
    vs_cmap = None
    for table in open_font(font)['cmap'].tables:
        if (table.format, table.platformID, table.platEncID) != (14, 0, 5):
            continue
        assert vs_cmap is None, 'More than one VS cmap in %s' % (font, )
        vs_cmap = table
    return vs_cmap
128
129
def get_emoji_map(font):
    """Build a map from code points and code point sequences to glyph names.

    The result combines three sources from the font:
      * the best cmap (single code points),
      * the variation-sequence cmap, keyed by (base, variation selector),
      * GSUB ligature lookups (LookupType 4), keyed by the tuple of code
        points that the ligature's component glyphs map back to.
    """
    # Add normal characters
    emoji_map = copy.copy(get_best_cmap(font))

    # Glyph name -> code point, ignoring private-use code points. Built before
    # any sequence keys are added, so every key seen here is a single int.
    reverse_cmap = {}
    for codepoint, glyph_name in emoji_map.items():
        if contains_pua(codepoint):
            continue
        reverse_cmap[glyph_name] = codepoint

    # Add variation sequences
    vs_cmap = get_variation_sequences_cmap(font)
    if vs_cmap:
        for selector, entries in vs_cmap.uvsDict.items():
            for base, glyph_name in entries:
                # An entry without a glyph reuses the base character's glyph.
                emoji_map[(base, selector)] = (
                    emoji_map[base] if glyph_name is None else glyph_name)

    # Add GSUB rules
    ttfont = open_font(font)
    for lookup in ttfont['GSUB'].table.LookupList.Lookup:
        # Other lookups are used in the emoji font for fallback.
        # We ignore them for now.
        if lookup.LookupType == 4:
            for subtable in lookup.SubTable:
                ligatures = subtable.ligatures
                for first_glyph in ligatures:
                    for ligature in ligatures[first_glyph]:
                        glyph_names = [first_glyph] + ligature.Component
                        sequence = tuple(
                            reverse_cmap[glyph] for glyph in glyph_names)
                        # Make sure no starting subsequence of 'sequence' has
                        # been seen before.
                        for prefix_len in range(2, len(sequence)+1):
                            assert sequence[:prefix_len] not in emoji_map
                        emoji_map[sequence] = ligature.LigGlyph

    return emoji_map
167
168
def assert_font_supports_any_of_chars(font, chars):
    """Exit the program unless the font's best cmap has at least one of chars."""
    best_cmap = get_best_cmap(font)
    if not any(char in best_cmap for char in chars):
        sys.exit('None of characters in %s were found in %s' % (chars, font))
175
176
def assert_font_supports_all_of_chars(font, chars):
    """Assert that every character in chars is in the font's best cmap.

    The assertion message names the first missing character.
    """
    best_cmap = get_best_cmap(font)
    first_missing = next(
        (char for char in chars if char not in best_cmap), None)
    assert first_missing is None, (
        'U+%04X was not found in %s' % (first_missing, font))
182
183
def assert_font_supports_none_of_chars(font, chars, fallbackName):
    """Assert that none of chars is in the font's best cmap.

    Args:
        font: (file name, collection index) pair identifying the font.
        chars: iterable of code points that must be absent.
        fallbackName: name of the fallback chain being checked; when truthy it
            is included in the assertion message.
    """
    best_cmap = get_best_cmap(font)
    for char in chars:
        # The two branches used to be swapped, so the fallback name was only
        # mentioned when there was none to mention.
        if fallbackName:
            assert char not in best_cmap, (
                'U+%04X was found in %s in fallback %s' % (char, font, fallbackName))
        else:
            assert char not in best_cmap, 'U+%04X was found in %s' % (char, font)
192
193
def assert_font_supports_all_sequences(font, sequences):
    """Assert that each (base, variation selector) pair is in the font.

    A pair counts as supported when the font's variation-sequence cmap lists
    (base, None) under that selector. Pairs are checked in sorted order.
    """
    vs_dict = get_variation_sequences_cmap(font).uvsDict
    for base, vs in sorted(sequences):
        is_supported = vs in vs_dict and (base, None) in vs_dict[vs]
        assert is_supported, (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))
199
200
def check_hyphens(hyphens_dir):
    """Check that fonts for every hyphenated script contain a hyphen.

    The scripts are derived from the language codes embedded in the names of
    the '*.hyb' files in hyphens_dir. Every font mapped to such a script must
    support U+002D or U+2010.
    """
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_path in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        file_name = path.basename(hyb_path)
        assert file_name.startswith('hyph-'), (
            'Unknown hyphenation file %s' % file_name)
        # The language code sits between the first '-' and the first '.'.
        lang_start = file_name.index('-') + 1
        lang_end = file_name.index('.')
        scripts.add(lang_to_script(file_name[lang_start:lang_end]))

    hyphens = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, hyphens)
217
218
class FontRecord(object):
    """Plain record describing one <font> entry read from fonts.xml."""

    def __init__(self, name, psName, scripts, variant, weight, style, fallback_for, font):
        # Family-level attributes, followed by the scripts of the family.
        self.name, self.psName = name, psName
        self.scripts, self.variant = scripts, variant
        # Font-level attributes.
        self.weight, self.style = weight, style
        # `font` is a (file name, collection index) pair.
        self.fallback_for, self.font = fallback_for, font
229
230
def parse_fonts_xml(fonts_xml_path):
    """Parse fonts.xml and (re)build the module-level font tables.

    Populates three globals:
      * _script_to_font_map: script code -> set of (file name, index) pairs,
      * _fallback_chains: family name -> list of FontRecord,
      * _all_fonts: list of every FontRecord created.

    The <family> elements are walked twice: the first pass validates the
    family attributes and registers every named family as a fallback chain,
    the second pass creates the records and fills the tables. Malformed
    input fails an assertion.
    """
    global _script_to_font_map, _fallback_chains, _all_fonts
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chains = {}
    _all_fonts = []
    tree = ElementTree.parse(fonts_xml_path)
    families = tree.findall('family')
    # Minikin supports up to 254 but users can place their own font at the first
    # place. Thus, 253 is the maximum allowed number of font families in the
    # default collection.
    assert len(families) < 254, (
        'System font collection can contains up to 253 font families.')
    # First pass: validate family attributes and create an empty fallback
    # chain for each named family, so that fallbackFor references can be
    # resolved in the second pass regardless of element order.
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        # Read here but not used in this pass; families marked as ignored are
        # therefore still validated and registered.
        ignoreAttr = family.get('ignore')

        if name:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)
            assert name not in _fallback_chains, 'Duplicated name entry %s' % name
            _fallback_chains[name] = []
        else:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)

    # Captures the font file name without its surrounding whitespace.
    trim_re = re.compile(r"^[ \n\r\t]*(.+)[ \n\r\t]*$")
    # Second pass: create a FontRecord per <font> element.
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        ignoreAttr = family.get('ignore')
        ignore = ignoreAttr == 'true' or ignoreAttr == '1'

        if ignore:
            continue

        if langs:
            langs = langs.split()
            scripts = {lang_to_script(lang) for lang in langs}
        else:
            scripts = set()

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            font_file = child.text.rstrip()

            m = trim_re.match(font_file)
            font_file = m.group(1)

            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            fallback_for = child.get('fallbackFor')

            assert not name or not fallback_for, (
                'name and fallbackFor cannot be present at the same time')
            assert not fallback_for or fallback_for in _fallback_chains, (
                'Unknown fallback name: %s' % fallback_for)

            # Stays None when the attribute is absent.
            index = child.get('index')
            if index:
                index = int(index)

            if not path.exists(path.join(_fonts_dir, m.group(1))):
                continue # Missing font is a valid case. Just ignore the missing font files.

            record = FontRecord(
                name,
                child.get('postScriptName'),
                frozenset(scripts),
                variant,
                weight,
                style,
                fallback_for,
                (font_file, index))

            _all_fonts.append(record)

            if not fallback_for:
                if not name or name == 'sans-serif':
                    # Fonts of unnamed families and of 'sans-serif' are
                    # appended to every fallback chain.
                    for _, fallback in _fallback_chains.items():
                        fallback.append(record)
                else:
                    _fallback_chains[name].append(record)
            else:
                _fallback_chains[fallback_for].append(record)

            if name: # non-empty names are used for default LGC fonts
                map_scripts = {'Latn', 'Grek', 'Cyrl'}
            else:
                map_scripts = scripts
            for script in map_scripts:
                _script_to_font_map[script].add((font_file, index))
334
335
def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Run the emoji coverage check against every emoji font in fonts.xml."""
    check_emoji_font_coverage(get_emoji_fonts(), all_emoji, equivalent_emoji)
339
340
def get_emoji_fonts():
    """Return the fonts of all records whose scripts include 'Zsye'."""
    emoji_fonts = []
    for record in _all_fonts:
        if 'Zsye' in record.scripts:
            emoji_fonts.append(record.font)
    return emoji_fonts
343
def seq_any(sequence, pred):
    """Return whether pred holds for any element of a tuple.

    A non-tuple argument is treated as a single value and pred's result for
    it is returned as is.
    """
    if type(sequence) is not tuple:
        return pred(sequence)
    # The predicate is evaluated for every element before combining.
    results = [pred(item) for item in sequence]
    return any(results)
349
def seq_all(sequence, pred):
    """Return whether pred holds for every element of a tuple.

    A non-tuple argument is treated as a single value and pred's result for
    it is returned as is.
    """
    if type(sequence) is not tuple:
        return pred(sequence)
    # The predicate is evaluated for every element before combining.
    results = [pred(item) for item in sequence]
    return all(results)
355
def is_regional_indicator(x):
    """Return whether x is a regional indicator symbol (A..Z)."""
    first_indicator = 0x1F1E6
    last_indicator = 0x1F1FF
    return x >= first_indicator and x <= last_indicator
359
def is_tag(x):
    """Return whether x lies in the tag block, U+E0000..U+E007F."""
    block_start = 0xE0000
    block_end = 0xE007F
    return x >= block_start and x <= block_end
363
def is_pua(x):
    """Return whether x falls in one of the three private-use ranges."""
    pua_ranges = (
        (0xE000, 0xF8FF),
        (0xF0000, 0xFFFFD),
        (0x100000, 0x10FFFD),
    )
    return any(low <= x <= high for low, high in pua_ranges)
366
def contains_pua(sequence):
    """Return whether a code point, or any member of a tuple, is private-use."""
    has_private_use = seq_any(sequence, is_pua)
    return has_private_use
369
def contains_regional_indicator(sequence):
    """Return whether a code point, or any member of a tuple, is a regional indicator."""
    has_indicator = seq_any(sequence, is_regional_indicator)
    return has_indicator
372
def only_tags(sequence):
    """Return whether a code point, or every member of a tuple, is a tag character."""
    all_are_tags = seq_all(sequence, is_tag)
    return all_are_tags
375
def get_psname(ttf):
    """Return the font's PostScript name as a string.

    The name is the first 'name' table record with platformID 3, platEncID 1
    and nameID 6. StopIteration propagates when no such record exists.
    """
    def is_ps_name_record(record):
        return (record.platformID == 3 and record.platEncID == 1
                and record.nameID == 6)

    first_match = next(filter(is_ps_name_record, ttf['name'].names))
    return str(first_match)
379
def hex_strs(sequence):
    """Return hexadecimal text for a code point or a tuple of code points.

    A tuple yields a tuple of upper-case digits without prefix; a single
    value yields the result of the built-in hex().
    """
    if type(sequence) is not tuple:
        return hex(sequence)
    return tuple(format(codepoint, 'X') for codepoint in sequence)
384
def check_emoji_not_compat(all_emoji, equivalent_emoji):
    """Assert that no emoji font has an 'Emji' entry in its 'meta' table.

    Both parameters are accepted but not used. Fonts without a 'meta' table
    pass the check.
    """
    for emoji_font in get_emoji_fonts():
        ttf = open_font(emoji_font)
        # The name itself is not used, but the lookup raises when the font
        # has no matching name record, so the call is kept.
        get_psname(ttf)

        if "meta" not in ttf:
            continue
        assert 'Emji' not in ttf["meta"].data, 'NotoColorEmoji MUST be a compat font'
393
394
def check_emoji_font_coverage(emoji_fonts, all_emoji, equivalent_emoji):
    """Check the emoji fonts against the expected set of emoji.

    Args:
        emoji_fonts: iterable of (file name, collection index) pairs.
        all_emoji: set of code points and code point tuples that must be
            supported by at least one of the fonts.
        equivalent_emoji: dict mapping a sequence to the sequence it must
            share a glyph with.

    All problems found are collected and reported together in one failed
    assertion:
      * an expected emoji that no font supports,
      * a supported sequence that is not expected,
      * equivalent sequences that map to different glyphs,
      * non-equivalent sequences that map to the same glyph.
    """
    coverages = [get_emoji_map(emoji_font) for emoji_font in emoji_fonts]

    errors = []

    for sequence in all_emoji:
        if not any(sequence in coverage for coverage in coverages):
            errors.append('%s is not supported in the emoji font.' % printable(sequence))

    # The font needs to support a few extra characters, which is OK
    allowed_extras = {0x0000, 0x000D, 0x0020}
    for coverage in coverages:
        for sequence in coverage:
            if sequence in allowed_extras:
                continue

            if contains_pua(sequence):
                # The font needs to have some PUA for EmojiCompat library.
                continue

            if sequence not in all_emoji:
                errors.append('%s support unexpected in the emoji font.' % printable(sequence))

    for first, second in equivalent_emoji.items():
        for coverage in coverages:
            if first not in coverage or second not in coverage:
                continue  # sequence will be reported missing
            if coverage[first] != coverage[second]:
                errors.append('%s and %s should map to the same glyph.' % (
                    printable(first),
                    printable(second)))

    for coverage in coverages:
        # Group the non-PUA sequences by glyph in a single pass instead of
        # rescanning the whole coverage map once per glyph.
        sequences_by_glyph = collections.defaultdict(list)
        for seq, glyph in coverage.items():
            if not contains_pua(seq):
                sequences_by_glyph[glyph].append(seq)

        for glyph, maps_to_glyph in sequences_by_glyph.items():
            if len(maps_to_glyph) <= 1:
                continue
            # There are more than one sequences mapping to the same glyph. We
            # need to make sure they were expected to be equivalent.
            equivalent_seqs = set()
            for seq in maps_to_glyph:
                # Follow the equivalence chain to its final target.
                equivalent_seq = seq
                while equivalent_seq in equivalent_emoji:
                    equivalent_seq = equivalent_emoji[equivalent_seq]
                equivalent_seqs.add(equivalent_seq)
            if len(equivalent_seqs) != 1:
                errors.append('The sequences %s should not result in the same glyph %s' % (
                    printable(equivalent_seqs),
                    glyph))

    assert not errors, '%d emoji font errors:\n%s\n%d emoji font coverage errors' % (len(errors), '\n'.join(errors), len(errors))
447
448
def check_emoji_defaults(default_emoji):
    """Check that non-emoji fonts do not take over default-emoji characters.

    For every fallback chain, each checked font must support none of
    default_emoji. Characters with the Emoji property that are not in
    default_emoji must be found in a font placed before the emoji font,
    except for those whose age is '7.0'.

    NOTE(review): the set of missing text-style characters is created once
    and shared by all chains, so characters found while walking one chain
    stay removed for the following chains. Confirm that this is intended.
    """
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    for name, fallback_chain in _fallback_chains.items():
        emoji_font_seen = False
        for record in fallback_chain:
            record_scripts = record.scripts
            if 'Zsye' in record_scripts:
                # No need to check the emoji font
                emoji_font_seen = True
                continue

            # For later fonts, we only check them if they have a script
            # defined, since the defined script may get them to a higher
            # score even if they appear after the emoji font. However,
            # we should skip checking the text symbols font, since
            # symbol fonts should be able to override the emoji display
            # style when 'Zsym' is explicitly specified by the user.
            skip_after_emoji = not record_scripts or 'Zsym' in record_scripts
            if emoji_font_seen and skip_after_emoji:
                continue

            # Check default emoji-style characters
            assert_font_supports_none_of_chars(record.font, default_emoji, name)

            # Mark default text-style characters appearing in fonts above the
            # emoji font as seen
            if not emoji_font_seen:
                missing_text_chars -= set(get_best_cmap(record.font))

        # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
        # webdings yet.
        missing_text_chars -= _chars_by_age['7.0']
        assert missing_text_chars == set(), (
            'Text style version of some emoji characters are missing: ' +
                repr(missing_text_chars))
481
482
# Setting reverse to true returns a dictionary that maps the values to sets of
# characters, useful for some binary properties. Otherwise, we get a
# dictionary that maps characters to the property values, assuming there's only
# one property in the file.
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a semicolon-separated Unicode data file.

    Everything after a '#' is a comment. The first field of a line is a
    single code point ('1F600'), a range ('1F600..1F64F') or a
    whitespace-separated sequence ('0023 FE0F 20E3'); the second field is the
    property value. Further fields are ignored.

    Args:
        file_path: path of the data file, read as UTF-8.
        reverse: when true, return a defaultdict mapping each property value
            to the set of its characters/sequences; otherwise return a dict
            mapping each character/sequence to its property value.

    Fails an assertion when reverse is false and an entry appears twice.
    """
    output_dict = collections.defaultdict(set) if reverse else {}
    # The Unicode data files are UTF-8 and carry non-ASCII text in their
    # comments, so do not depend on the locale's default encoding.
    with open(file_path, encoding='utf-8') as datafile:
        for line in datafile:
            line = line.partition('#')[0].strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            # split() without argument tolerates repeated spaces or tabs
            # between the code points of a sequence.
            codepoints = chars.split()
            if len(codepoints) > 1:  # character sequence
                additions = [tuple(int(ch, 16) for ch in codepoints)]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                additions = range(int(char_start, 16), int(char_end, 16) + 1)
            else:  # single character
                additions = [int(chars, 16)]

            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict, (
                        'Duplicated entry %r in %s' % (addition, file_path))
                    output_dict[addition] = prop
    return output_dict
521
522
def parse_emoji_variants(file_path):
    """Parse a file in the emoji-variation-sequences.txt format.

    Each data line has exactly three semicolon-separated fields after the
    '#' comment is removed: a 'BASE VS' pair of hexadecimal code points, a
    description, and a third field that is ignored.

    Args:
        file_path: path of the data file, read as UTF-8.

    Returns:
        A (text_set, emoji_set) pair of sets of (base, variation selector)
        tuples, holding the lines described as 'text style' and 'emoji style'
        respectively. Lines with any other description are skipped.
    """
    emoji_set = set()
    text_set = set()
    # The Unicode data files are UTF-8 and carry non-ASCII text in their
    # comments, so do not depend on the locale's default encoding.
    with open(file_path, encoding='utf-8') as datafile:
        for line in datafile:
            line = line.partition('#')[0].strip()
            if not line:
                continue
            sequence, description, _ = line.split(';')
            # split() without argument tolerates repeated spaces or tabs.
            codepoints = sequence.split()
            variation = (int(codepoints[0], 16), int(codepoints[1], 16))
            description = description.strip()
            if description == 'text style':
                text_set.add(variation)
            elif description == 'emoji style':
                emoji_set.add(variation)
    return text_set, emoji_set
543
544
def parse_ucd(ucd_path):
    """Load the Unicode emoji data under ucd_path into module-level globals.

    Reads the emoji property, age, variation sequence, emoji sequence and ZWJ
    sequence files, merges in the files from the 'additions' sub-directory,
    and drops everything listed in additions/emoji-exclusions.txt.
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences

    def ucd_file(*parts):
        return path.join(ucd_path, *parts)

    _emoji_properties = parse_unicode_datafile(
        ucd_file('emoji-data.txt'), reverse=True)
    extra_properties = parse_unicode_datafile(
        ucd_file('additions', 'emoji-data.txt'), reverse=True)
    for prop, chars in extra_properties.items():
        _emoji_properties[prop].update(chars)

    _chars_by_age = parse_unicode_datafile(
        ucd_file('DerivedAge.txt'), reverse=True)
    _text_variation_sequences, _emoji_variation_sequences = (
        parse_emoji_variants(ucd_file('emoji-variation-sequences.txt')))

    _emoji_sequences = parse_unicode_datafile(ucd_file('emoji-sequences.txt'))
    _emoji_sequences.update(
        parse_unicode_datafile(ucd_file('additions', 'emoji-sequences.txt')))
    _emoji_zwj_sequences = parse_unicode_datafile(
        ucd_file('emoji-zwj-sequences.txt'))
    _emoji_zwj_sequences.update(
        parse_unicode_datafile(ucd_file('additions', 'emoji-zwj-sequences.txt')))

    exclusions = parse_unicode_datafile(
        ucd_file('additions', 'emoji-exclusions.txt'))
    _emoji_sequences = remove_emoji_exclude(_emoji_sequences, exclusions)
    _emoji_zwj_sequences = remove_emoji_exclude(_emoji_zwj_sequences, exclusions)
    _emoji_variation_sequences = remove_emoji_variation_exclude(
        _emoji_variation_sequences, exclusions)

    # Unicode 12.0 adds Basic_Emoji in emoji-sequences.txt. We ignore them here
    # since we are already checking the emoji presentations with
    # emoji-variation-sequences.txt.
    # Please refer to http://unicode.org/reports/tr51/#def_basic_emoji_set .
    _emoji_sequences = {
        sequence: kind
        for sequence, kind in _emoji_sequences.items()
        if kind != 'Basic_Emoji'
    }
578
579
def remove_emoji_variation_exclude(source, items):
    """Return the members of the set `source` that are not keys of `items`."""
    excluded = items.keys()
    return source.difference(excluded)
582
def remove_emoji_exclude(source, items):
    """Return a copy of the dict `source` without the keys found in `items`."""
    kept = {}
    for key, value in source.items():
        if key in items:
            continue
        kept[key] = value
    return kept
585
def flag_sequence(territory_code):
    """Return the regional-indicator tuple for an upper-case territory code.

    Each letter is mapped to 0x1F1E6 plus its offset from 'A'.
    """
    indicator_a = 0x1F1E6
    offset_base = ord('A')
    return tuple(indicator_a + ord(letter) - offset_base
                 for letter in territory_code)
588
# Flag sequences (keys) that are expected to map to the same glyph as another
# territory's flag (values).
EQUIVALENT_FLAGS = {
    flag_sequence('BV'): flag_sequence('NO'),
    flag_sequence('CP'): flag_sequence('FR'),
    flag_sequence('HM'): flag_sequence('AU'),
    flag_sequence('SJ'): flag_sequence('NO'),
    flag_sequence('UM'): flag_sequence('US'),
}

# U+20E3 COMBINING ENCLOSING KEYCAP, the second element of keycap sequences.
COMBINING_KEYCAP = 0x20E3

# Private-use code points (keys) and the flag or keycap sequences (values)
# that they must share a glyph with. The keys are also added to the expected
# emoji set by compute_expected_emoji().
LEGACY_ANDROID_EMOJI = {
    0xFE4E5: flag_sequence('JP'),
    0xFE4E6: flag_sequence('US'),
    0xFE4E7: flag_sequence('FR'),
    0xFE4E8: flag_sequence('DE'),
    0xFE4E9: flag_sequence('IT'),
    0xFE4EA: flag_sequence('GB'),
    0xFE4EB: flag_sequence('ES'),
    0xFE4EC: flag_sequence('RU'),
    0xFE4ED: flag_sequence('CN'),
    0xFE4EE: flag_sequence('KR'),
    0xFE82C: (ord('#'), COMBINING_KEYCAP),
    0xFE82E: (ord('1'), COMBINING_KEYCAP),
    0xFE82F: (ord('2'), COMBINING_KEYCAP),
    0xFE830: (ord('3'), COMBINING_KEYCAP),
    0xFE831: (ord('4'), COMBINING_KEYCAP),
    0xFE832: (ord('5'), COMBINING_KEYCAP),
    0xFE833: (ord('6'), COMBINING_KEYCAP),
    0xFE834: (ord('7'), COMBINING_KEYCAP),
    0xFE835: (ord('8'), COMBINING_KEYCAP),
    0xFE836: (ord('9'), COMBINING_KEYCAP),
    0xFE837: (ord('0'), COMBINING_KEYCAP),
}

# This is used to define the emoji that should have the same glyph.
# For example, previously we had the gender-based Kiss (0x1F48F), which had the
# same glyph as Kiss: Woman, Man
# (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468).
# In that case a valid row would be:
# (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
ZWJ_IDENTICALS = {
}

# Pairs of flag sequences that must map to the same glyph; the first element of
# each pair is recorded as equivalent to the second.
SAME_FLAG_MAPPINGS = [
    # Diego Garcia and British Indian Ocean Territory
    ((0x1F1EE, 0x1F1F4), (0x1F1E9, 0x1F1EC)),
    # St. Martin and France
    ((0x1F1F2, 0x1F1EB), (0x1F1EB, 0x1F1F7)),
    # Spain and Ceuta & Melilla
    ((0x1F1EA, 0x1F1F8), (0x1F1EA, 0x1F1E6)),
]

# U+200D ZERO WIDTH JOINER.
ZWJ = 0x200D

# Tag sequence without any tag letters; compute_expected_emoji() adds it to the
# expected emoji sequences as a supported fallback.
EMPTY_FLAG_SEQUENCE = (0x1F3F4, 0xE007F)
643
def is_fitzpatrick_modifier(cp):
    """Return whether cp is a skin tone modifier, U+1F3FB..U+1F3FF."""
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Return seq reversed, keeping skin tone modifiers after their base.

    Plain reversal would put every modifier in front of the emoji it
    modifies, so each modifier is swapped back behind the element that
    follows it in the reversed list.
    """
    rev = list(reversed(seq))
    # if there are fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify
    i = 1
    while i < len(rev):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
            # The modifier now sits at index i. Step over it, otherwise the
            # next iteration would see it again and keep pushing it towards
            # the end of the sequence.
            i += 2
        else:
            i += 1
    return tuple(rev)
656
657
def compute_expected_emoji():
    """Derive the expected emoji sets from the parsed Unicode data.

    Returns a tuple (all_emoji, default_emoji, equivalent_emoji):
      * all_emoji: every code point and sequence the emoji font may support,
      * default_emoji: those that default to emoji presentation,
      * equivalent_emoji: dict mapping a sequence to the one it must share a
        glyph with.

    As a side effect, EMPTY_FLAG_SEQUENCE is added to the module-level
    _emoji_sequences.
    """
    # add zwj sequences not in the current emoji-zwj-sequences.txt
    adjusted_emoji_zwj_sequences = dict(_emoji_zwj_sequences)
    adjusted_emoji_zwj_sequences.update(_emoji_zwj_sequences)

    # Add empty flag tag sequence that is supported as fallback
    _emoji_sequences[EMPTY_FLAG_SEQUENCE] = 'Emoji_Tag_Sequence'

    all_sequences = set(_emoji_variation_sequences)
    sequence_pieces = set()
    for sequence_table in (_emoji_sequences, adjusted_emoji_zwj_sequences):
        for raw_sequence in sequence_table:
            # Sequences are recorded without the emoji variation selector.
            stripped = tuple(ch for ch in raw_sequence if ch != EMOJI_VS)
            all_sequences.add(stripped)
            sequence_pieces.update(stripped)

    # Add all tag characters used in flags
    sequence_pieces.update(range(0xE0030, 0xE0039 + 1))
    sequence_pieces.update(range(0xE0061, 0xE007A + 1))

    legacy_codepoints = set(LEGACY_ANDROID_EMOJI.keys())
    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        legacy_codepoints)
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        all_sequences |
        legacy_codepoints)

    # Later sources override earlier ones, so the order matters.
    equivalent_emoji = dict(SAME_FLAG_MAPPINGS)
    for mapping in (EQUIVALENT_FLAGS, LEGACY_ANDROID_EMOJI, ZWJ_IDENTICALS):
        equivalent_emoji.update(mapping)

    # A variation sequence must share its glyph with its base character.
    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji
706
707
def check_compact_only_fallback():
    """Assert that every compact font has an elegant counterpart.

    For each fallback chain, a record whose variant is 'compact' requires
    another record in the same chain with the same scripts and the variant
    'elegant'.
    """
    for fallback_chain in _fallback_chains.values():
        for record in fallback_chain:
            if record.variant != 'compact':
                continue
            has_elegant = any(
                other.scripts == record.scripts and other.variant == 'elegant'
                for other in fallback_chain)
            # The message is a plain string; a stray trailing comma used to
            # turn it into a one-element tuple.
            assert has_elegant, (
                '%s must be in elegant of %s as fallback of "%s" too' % (
                    record.font, record.scripts, record.fallback_for))
717
718
def check_vertical_metrics():
    """Assert the expected vertical metrics of the default font families.

    'sans-serif' and 'sans-serif-condensed' fonts must have head.yMax 2163
    and head.yMin -555. Those two families plus 'serif' and 'monospace' must
    have hhea.ascent 1900 and hhea.descent -500.
    """
    head_checked_families = ['sans-serif', 'sans-serif-condensed']
    hhea_checked_families = ['sans-serif', 'sans-serif-condensed',
                             'serif', 'monospace']
    for record in _all_fonts:
        family_name = record.name

        if family_name in head_checked_families:
            font = open_font(record.font)
            assert (font['head'].yMax, font['head'].yMin) == (2163, -555), (
                'yMax and yMin of %s do not match expected values.' % (
                record.font,))

        if family_name in hhea_checked_families:
            font = open_font(record.font)
            assert (font['hhea'].ascent, font['hhea'].descent) == (1900, -500), (
                        'ascent and descent of %s do not match expected '
                        'values.' % (record.font,))
734
735
def check_cjk_punctuation():
    """Assert that fonts placed before a CJK font lack U+3000..U+301F.

    Each fallback chain is walked until the first record whose scripts
    include one of 'Hans', 'Hant', 'Jpan' or 'Kore'.
    """
    cjk_scripts = {'Hans', 'Hant', 'Jpan', 'Kore'}
    cjk_punctuation = range(0x3000, 0x301F + 1)
    for name, fallback_chain in _fallback_chains.items():
        for record in fallback_chain:
            if not record.scripts.isdisjoint(cjk_scripts):
                # CJK font seen. Stop checking the rest of the fonts.
                break
            assert_font_supports_none_of_chars(record.font, cjk_punctuation, name)
745
def getPostScriptName(font):
    """Return the PostScript name stored in the font, or None.

    The name is the first 'name' table record with nameID 6, platformID 3,
    platEncID 1 and langID 0x0409. `font` is a (file name, collection index)
    pair resolved against the module-level _fonts_dir.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is None:
        ttf = ttLib.TTFont(font_path)
    else:
        # Use the first font file in the collection for resolving post script name.
        ttf = ttLib.TTFont(font_path, fontNumber=0)

    wanted = (6, 3, 1, 0x0409)
    for record in ttf['name'].names:
        key = (record.nameID, record.platformID, record.platEncID, record.langID)
        if key == wanted:
            return str(record)
    return None
760
def check_canonical_name():
    """Assert that every font's PostScript name matches fonts.xml.

    The name found in the font must equal the record's postScriptName
    attribute when there is one, and otherwise the file name without its
    last four characters.
    """
    for record in _all_fonts:
        file_name, _ = record.font
        actual_ps_name = getPostScriptName(record.font)

        if not record.psName:
            # If fonts element doesn't have postScriptName attribute, the file
            # name should match with the PostScript name in the name table.
            assert actual_ps_name == file_name[:-4], ('file name %s should match with %s' % (
                file_name, actual_ps_name))
            continue

        # If fonts element has postScriptName attribute, it should match with
        # the PostScript name in the name table.
        assert actual_ps_name == record.psName, ('postScriptName attribute %s should match with %s' % (
            record.psName, actual_ps_name))
776
777
def main():
    """Run all font checks.

    Command line arguments:
      1. the target output directory, containing 'fonts', 'etc/fonts.xml'
         and 'usr/hyphen-data',
      2. 'true' to also run the emoji checks,
      3. the directory of the Unicode data files (read only when the emoji
         checks run).
    """
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    parse_fonts_xml(path.join(target_out, 'etc', 'fonts.xml'))

    check_compact_only_fallback()
    check_vertical_metrics()
    check_hyphens(path.join(target_out, 'usr', 'hyphen-data'))
    check_cjk_punctuation()
    check_canonical_name()

    if sys.argv[2] != 'true':
        return

    parse_ucd(sys.argv[3])
    all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
    check_emoji_not_compat(all_emoji, equivalent_emoji)
    check_emoji_coverage(all_emoji, equivalent_emoji)
    check_emoji_defaults(default_emoji)
806
807
# Run the checks only when the file is executed as a script.
if __name__ == '__main__':
    main()
810