diff options
| author | Andy Russell <arussell123@gmail.com> | 2019-01-05 14:17:51 -0500 |
|---|---|---|
| committer | Andy Russell <arussell123@gmail.com> | 2019-01-06 01:27:33 -0500 |
| commit | 6fefcee73a5dfee20281396e0f1418ceda81776b (patch) | |
| tree | 67f89e609c06fbb15c0c68271cb550be1843aabf /src/etc/htmldocck.py | |
| parent | 68614265d312fc2cbe8a696f7dabb9416eb6f221 (diff) | |
| download | rust-6fefcee73a5dfee20281396e0f1418ceda81776b.tar.gz rust-6fefcee73a5dfee20281396e0f1418ceda81776b.zip | |
use utf-8 throughout htmldocck
This commit improves compatibility with Python 3, which already uses Unicode throughout. It also fixes a subtle incompatibility stemming from the use of `entitydefs`, which contains replacement text _encoded in latin-1_ for HTML entities. When using Python 3, this would cause `0xa0` to be incorrectly added to the element tree. This meant that there was a rustdoc test that would pass under Python 2 but fail under Python 3, due to an incorrect regex match against the non-breaking space character. This commit triggers that failure in both versions, and also fixes it.
Diffstat (limited to 'src/etc/htmldocck.py')
| -rw-r--r-- | src/etc/htmldocck.py | 46 |
1 files changed, 28 insertions, 18 deletions
diff --git a/src/etc/htmldocck.py b/src/etc/htmldocck.py index ef41e426f28..e8be2b9b537 100644 --- a/src/etc/htmldocck.py +++ b/src/etc/htmldocck.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + r""" htmldocck.py is a custom checker script for Rustdoc HTML outputs. @@ -98,7 +101,10 @@ checks if the given file does not exist, for example. """ -from __future__ import print_function +from __future__ import absolute_import, print_function, unicode_literals + +import codecs +import io import sys import os.path import re @@ -110,14 +116,10 @@ except ImportError: from HTMLParser import HTMLParser from xml.etree import cElementTree as ET -# ⇤/⇥ are not in HTML 4 but are in HTML 5 try: - from html.entities import entitydefs + from html.entities import name2codepoint except ImportError: - from htmlentitydefs import entitydefs -entitydefs['larrb'] = u'\u21e4' -entitydefs['rarrb'] = u'\u21e5' -entitydefs['nbsp'] = ' ' + from htmlentitydefs import name2codepoint # "void elements" (no closing tag) from the HTML Standard section 12.1.2 VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', @@ -157,11 +159,11 @@ class CustomHTMLParser(HTMLParser): self.__builder.data(data) def handle_entityref(self, name): - self.__builder.data(entitydefs[name]) + self.__builder.data(unichr(name2codepoint[name])) def handle_charref(self, name): code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10) - self.__builder.data(unichr(code).encode('utf-8')) + self.__builder.data(unichr(code)) def close(self): HTMLParser.close(self) @@ -210,11 +212,11 @@ LINE_PATTERN = re.compile(r''' (?<=(?<!\S)@)(?P<negated>!?) (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*) (?P<args>.*)$ -''', re.X) +''', re.X | re.UNICODE) def get_commands(template): - with open(template, 'rU') as f: + with io.open(template, encoding='utf-8') as f: for lineno, line in concat_multi_lines(f): m = LINE_PATTERN.search(line) if not m: @@ -226,7 +228,10 @@ def get_commands(template): if args and not args[:1].isspace(): print_err(lineno, line, 'Invalid template syntax') continue - args = shlex.split(args) + try: + args = shlex.split(args) + except UnicodeEncodeError: + args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))] yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line) @@ -280,7 +285,7 @@ class CachedFiles(object): if not(os.path.exists(abspath) and os.path.isfile(abspath)): raise FailedCheck('File does not exist {!r}'.format(path)) - with open(abspath) as f: + with io.open(abspath, encoding='utf-8') as f: data = f.read() self.files[path] = data return data @@ -294,9 +299,9 @@ class CachedFiles(object): if not(os.path.exists(abspath) and os.path.isfile(abspath)): raise FailedCheck('File does not exist {!r}'.format(path)) - with open(abspath) as f: + with io.open(abspath, encoding='utf-8') as f: try: - tree = ET.parse(f, CustomHTMLParser()) + tree = ET.fromstringlist(f.readlines(), CustomHTMLParser()) except Exception as e: raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e)) self.trees[path] = tree @@ -313,7 +318,7 @@ def check_string(data, pat, regexp): if not pat: return True # special case a presence testing elif regexp: - return re.search(pat, data) is not None + return re.search(pat, data, flags=re.UNICODE) is not None else: data = ' '.join(data.split()) pat = ' '.join(pat.split()) @@ -350,7 +355,7 @@ def check_tree_text(tree, path, pat, regexp): break except Exception as e: print('Failed to get path "{}"'.format(path)) - raise e + raise return ret @@ -359,7 +364,12 @@ def get_tree_count(tree, path): return len(tree.findall(path)) def stderr(*args): - print(*args, file=sys.stderr) + if sys.version_info.major < 3: + file = codecs.getwriter('utf-8')(sys.stderr) + else: + file = sys.stderr + + print(*args, file=file) def print_err(lineno, context, err, message=None): global ERR_COUNT |
