From 9875fd8537b25ae69b67f0fd0ffdf8aada1ea250 Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Sun, 20 Jan 2019 11:10:14 +0100 Subject: [PATCH] Attempt to fix localhtmlvalidate It was already broken and didn't work properly, but update to py3 and at least make it run. More changes are necessary to actually make it happy again. --- tools/localhtmlvalidate/localhtmlvalidate.py | 85 +++++++------------- 1 file changed, 31 insertions(+), 54 deletions(-) diff --git a/tools/localhtmlvalidate/localhtmlvalidate.py b/tools/localhtmlvalidate/localhtmlvalidate.py index 34869372..4332350b 100755 --- a/tools/localhtmlvalidate/localhtmlvalidate.py +++ b/tools/localhtmlvalidate/localhtmlvalidate.py @@ -14,31 +14,9 @@ # import sys -import urllib -import httplib +import requests import re -import HTMLParser - -BOUNDARY = "-=--=foobar-=--=" - - -def encode_multipart_formdata(fields, files): - L = [] - for (key, value) in fields: - L.append('--' + BOUNDARY) - L.append('Content-Disposition: form-data; name="%s"' % key) - L.append('') - L.append(value) - for (key, filename, value) in files: - L.append('--' + BOUNDARY) - L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) - L.append('Content-Type: text/html') - L.append('') - L.append(value) - L.append('--' + BOUNDARY + '--') - L.append('') - body = "\r\n".join(L) - return body +import html.parser if __name__ == "__main__": @@ -46,7 +24,8 @@ if __name__ == "__main__": print("Usage: localhtmlvalidate.py ") sys.exit(1) - contents = urllib.urlopen(sys.argv[1]).read() + r = requests.get(sys.argv[1]) + contents = r.text # Try to figure out where the actual contents start :) try: @@ -55,42 +34,40 @@ if __name__ == "__main__": firstline = 0 # Generate a form body - body = encode_multipart_formdata( - [ - ('charset', 'utf-8'), - ('doctype', 'inline'), - ('group', '0'), - ('verbose', '1'), - ], - [('uploaded_file', 'test.html', contents)] - ) + data = { + 'doctype': 'Inline', + 'group': '0', + 'verbose': '1', + 'prefill': '1', + 'prefill_doctype': 'html401', + 'fragment': contents, + } # Now submit it to the w3c validator - h = httplib.HTTP("validator.w3.org") - h.putrequest("POST", "/check") - h.putheader("User-Agent: localcheck-tester/0.0") - h.putheader("content-type", "multipart/form-data; boundary=%s" % BOUNDARY) - h.putheader("content-length", str(len(body))) - h.endheaders() - h.send(body) - errcode, errmsg, headers = h.getreply() - rbody = h.getfile().read() - if headers['x-w3c-validator-status'] == 'Valid': + resp = requests.post( + 'https://validator.w3.org/check', + data=data, + headers={ + "User-Agent": "localcheck-tester/0.0", + }, + timeout=20, + ) + if resp.headers['x-w3c-validator-status'] == 'Valid': print("Page validates!") sys.exit(0) - elif headers['x-w3c-validator-status'] == 'Invalid': + elif resp.headers['x-w3c-validator-status'] == 'Invalid': print("Invalid!") - print("Errors: %s" % headers['x-w3c-validator-errors']) - print("Warnings: %s" % headers['x-w3c-validator-warnings']) - hp = HTMLParser.HTMLParser() - for m in re.findall('
  • .*?
  • ', rbody, re.DOTALL): - r = re.search('Line (\d+).*(.*?)', m, re.DOTALL) - print("Line %s (should be around %s): %s" % (r.group(1), int(r.group(1)) - firstline, hp.unescape(r.group(2)))) - - r2 = re.search('(.*?)(.*?)(.*?)', unicode(m, 'utf8'), re.DOTALL) + print("Errors: %s" % resp.headers['x-w3c-validator-errors']) + print("Warnings: %s" % resp.headers['x-w3c-validator-warnings']) + hp = html.parser.HTMLParser() + for m in re.findall('
  • .*?
  • ', resp.text, re.DOTALL): + r = re.search('Line (\d+).*(.*?)', m, re.DOTALL) + if r: + print("Line %s (should be around %s): %s" % (r.group(1), int(r.group(1)) - firstline, hp.unescape(r.group(2)))) + r2 = re.search('(.*?)(.*?)(.*?)', m, re.DOTALL) if r2: s = "%s%s%s" % r2.groups() - print("Source: %s" % hp.unescape(s).encode('utf-8')) + print("Source: %s" % hp.unescape(s)) print("") else: print("Unknown status: %s" % headers['x-w3c-validator-status']) -- 2.39.5