Attempt to fix localhtmlvalidate
author Magnus Hagander <magnus@hagander.net>
Sun, 20 Jan 2019 10:10:14 +0000 (11:10 +0100)
committer Magnus Hagander <magnus@hagander.net>
Sat, 26 Jan 2019 15:19:26 +0000 (16:19 +0100)
It was already broken and didn't work properly, but update to py3 and at
least make it run. More changes are necessary to actually make it happy
again.

tools/localhtmlvalidate/localhtmlvalidate.py

index 348693720db32da5497460e44edffdc042feaeba..4332350bf78f18e45cfeda5e944a05accd917580 100755 (executable)
 #
 
 import sys
-import urllib
-import httplib
+import requests
 import re
-import HTMLParser
-
-BOUNDARY = "-=--=foobar-=--="
-
-
-def encode_multipart_formdata(fields, files):
-    L = []
-    for (key, value) in fields:
-        L.append('--' + BOUNDARY)
-        L.append('Content-Disposition: form-data; name="%s"' % key)
-        L.append('')
-        L.append(value)
-    for (key, filename, value) in files:
-        L.append('--' + BOUNDARY)
-        L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
-        L.append('Content-Type: text/html')
-        L.append('')
-        L.append(value)
-    L.append('--' + BOUNDARY + '--')
-    L.append('')
-    body = "\r\n".join(L)
-    return body
+import html.parser
 
 
 if __name__ == "__main__":
@@ -46,7 +24,8 @@ if __name__ == "__main__":
         print("Usage: localhtmlvalidate.py <local url>")
         sys.exit(1)
 
-    contents = urllib.urlopen(sys.argv[1]).read()
+    r = requests.get(sys.argv[1])
+    contents = r.text
 
     # Try to figure out where the actual contents start :)
     try:
@@ -55,42 +34,40 @@ if __name__ == "__main__":
         firstline = 0
 
     # Generate a form body
-    body = encode_multipart_formdata(
-        [
-            ('charset', 'utf-8'),
-            ('doctype', 'inline'),
-            ('group', '0'),
-            ('verbose', '1'),
-        ],
-        [('uploaded_file', 'test.html', contents)]
-    )
+    data = {
+        'doctype': 'Inline',
+        'group': '0',
+        'verbose': '1',
+        'prefill': '1',
+        'prefill_doctype': 'html401',
+        'fragment': contents,
+    }
 
     # Now submit it to the w3c validator
-    h = httplib.HTTP("validator.w3.org")
-    h.putrequest("POST", "/check")
-    h.putheader("User-Agent: localcheck-tester/0.0")
-    h.putheader("content-type", "multipart/form-data; boundary=%s" % BOUNDARY)
-    h.putheader("content-length", str(len(body)))
-    h.endheaders()
-    h.send(body)
-    errcode, errmsg, headers = h.getreply()
-    rbody = h.getfile().read()
-    if headers['x-w3c-validator-status'] == 'Valid':
+    resp = requests.post(
+        'https://validator.w3.org/check',
+        data=data,
+        headers={
+            "User-Agent": "localcheck-tester/0.0",
+        },
+        timeout=20,
+    )
+    if resp.headers['x-w3c-validator-status'] == 'Valid':
         print("Page validates!")
         sys.exit(0)
-    elif headers['x-w3c-validator-status'] == 'Invalid':
+    elif resp.headers['x-w3c-validator-status'] == 'Invalid':
         print("Invalid!")
-        print("Errors: %s" % headers['x-w3c-validator-errors'])
-        print("Warnings: %s" % headers['x-w3c-validator-warnings'])
-        hp = HTMLParser.HTMLParser()
-        for m in re.findall('<li class="msg_err">.*?</li>', rbody, re.DOTALL):
-            r = re.search('<em>Line (\d+).*<span class="msg">(.*?)</span>', m, re.DOTALL)
-            print("Line %s (should be around %s): %s" % (r.group(1), int(r.group(1)) - firstline, hp.unescape(r.group(2))))
-
-            r2 = re.search('<code class="input">(.*?)<strong title=".*?">(.*?)</strong>(.*?)</code>', unicode(m, 'utf8'), re.DOTALL)
+        print("Errors: %s" % resp.headers['x-w3c-validator-errors'])
+        print("Warnings: %s" % resp.headers['x-w3c-validator-warnings'])
+        hp = html.parser.HTMLParser()
+        for m in re.findall('<li class="msg_err">.*?</li>', resp.text, re.DOTALL):
+            r = re.search('<em>Line <a href="[^"]+">(\d+)</a>.*<span class="msg">(.*?)</span>', m, re.DOTALL)
+            if r:
+                print("Line %s (should be around %s): %s" % (r.group(1), int(r.group(1)) - firstline, hp.unescape(r.group(2))))
+            r2 = re.search('<code class="input">(.*?)<strong title=".*?">(.*?)</strong>(.*?)</code>', m, re.DOTALL)
             if r2:
                 s = "%s%s%s" % r2.groups()
-                print("Source: %s" % hp.unescape(s).encode('utf-8'))
+                print("Source: %s" % hp.unescape(s))
             print("")
     else:
         print("Unknown status: %s" % headers['x-w3c-validator-status'])