From 9875fd8537b25ae69b67f0fd0ffdf8aada1ea250 Mon Sep 17 00:00:00 2001
From: Magnus Hagander <magnus@hagander.net>
Date: Sun, 20 Jan 2019 11:10:14 +0100
Subject: [PATCH] Attempt to fix localhtmlvalidate

The script was already broken and did not work properly. Port it to
Python 3 so that it at least runs; more changes are necessary before it
actually works properly again.
---
 tools/localhtmlvalidate/localhtmlvalidate.py | 85 +++++++-------------
 1 file changed, 31 insertions(+), 54 deletions(-)
diff --git a/tools/localhtmlvalidate/localhtmlvalidate.py b/tools/localhtmlvalidate/localhtmlvalidate.py
index 34869372..4332350b 100755
--- a/tools/localhtmlvalidate/localhtmlvalidate.py
+++ b/tools/localhtmlvalidate/localhtmlvalidate.py
@@ -14,31 +14,9 @@
 #
 
 import sys
-import urllib
-import httplib
+import requests
 import re
-import HTMLParser
-
-BOUNDARY = "-=--=foobar-=--="
-
-
-def encode_multipart_formdata(fields, files):
-    L = []
-    for (key, value) in fields:
-        L.append('--' + BOUNDARY)
-        L.append('Content-Disposition: form-data; name="%s"' % key)
-        L.append('')
-        L.append(value)
-    for (key, filename, value) in files:
-        L.append('--' + BOUNDARY)
-        L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
-        L.append('Content-Type: text/html')
-        L.append('')
-        L.append(value)
-    L.append('--' + BOUNDARY + '--')
-    L.append('')
-    body = "\r\n".join(L)
-    return body
+import html.parser
 
 
 if __name__ == "__main__":
@@ -46,7 +24,8 @@ if __name__ == "__main__":
         print("Usage: localhtmlvalidate.py <local url>")
         sys.exit(1)
 
-    contents = urllib.urlopen(sys.argv[1]).read()
+    r = requests.get(sys.argv[1])
+    contents = r.text
 
     # Try to figure out where the actual contents start :)
     try:
@@ -55,42 +34,40 @@ if __name__ == "__main__":
         firstline = 0
 
     # Generate a form body
-    body = encode_multipart_formdata(
-        [
-            ('charset', 'utf-8'),
-            ('doctype', 'inline'),
-            ('group', '0'),
-            ('verbose', '1'),
-        ],
-        [('uploaded_file', 'test.html', contents)]
-    )
+    data = {
+        'doctype': 'Inline',
+        'group': '0',
+        'verbose': '1',
+        'prefill': '1',
+        'prefill_doctype': 'html401',
+        'fragment': contents,
+    }
 
     # Now submit it to the w3c validator
-    h = httplib.HTTP("validator.w3.org")
-    h.putrequest("POST", "/check")
-    h.putheader("User-Agent: localcheck-tester/0.0")
-    h.putheader("content-type", "multipart/form-data; boundary=%s" % BOUNDARY)
-    h.putheader("content-length", str(len(body)))
-    h.endheaders()
-    h.send(body)
-    errcode, errmsg, headers = h.getreply()
-    rbody = h.getfile().read()
-    if headers['x-w3c-validator-status'] == 'Valid':
+    resp = requests.post(
+        'https://validator.w3.org/check',
+        data=data,
+        headers={
+            "User-Agent": "localcheck-tester/0.0",
+        },
+        timeout=20,
+    )
+    if resp.headers['x-w3c-validator-status'] == 'Valid':
         print("Page validates!")
         sys.exit(0)
-    elif headers['x-w3c-validator-status'] == 'Invalid':
+    elif resp.headers['x-w3c-validator-status'] == 'Invalid':
         print("Invalid!")
-        print("Errors: %s" % headers['x-w3c-validator-errors'])
-        print("Warnings: %s" % headers['x-w3c-validator-warnings'])
-        hp = HTMLParser.HTMLParser()
-        for m in re.findall('<li class="msg_err">.*?</li>', rbody, re.DOTALL):
-            r = re.search('<em>Line (\d+).*<span class="msg">(.*?)</span>', m, re.DOTALL)
-            print("Line %s (should be around %s): %s" % (r.group(1), int(r.group(1)) - firstline, hp.unescape(r.group(2))))
-
-            r2 = re.search('<code class="input">(.*?)<strong title=".*?">(.*?)</strong>(.*?)</code>', unicode(m, 'utf8'), re.DOTALL)
+        print("Errors: %s" % resp.headers['x-w3c-validator-errors'])
+        print("Warnings: %s" % resp.headers['x-w3c-validator-warnings'])
+        # Use html.unescape(): HTMLParser.unescape() was removed in Python 3.9
+        for m in re.findall('<li class="msg_err">.*?</li>', resp.text, re.DOTALL):
+            r = re.search(r'<em>Line <a href="[^"]+">(\d+)</a>.*<span class="msg">(.*?)</span>', m, re.DOTALL)
+            if r:
+                print("Line %s (should be around %s): %s" % (r.group(1), int(r.group(1)) - firstline, html.unescape(r.group(2))))
+            r2 = re.search('<code class="input">(.*?)<strong title=".*?">(.*?)</strong>(.*?)</code>', m, re.DOTALL)
             if r2:
                 s = "%s%s%s" % r2.groups()
-                print("Source: %s" % hp.unescape(s).encode('utf-8'))
+                print("Source: %s" % html.unescape(s))
             print("")
     else:
-        print("Unknown status: %s" % headers['x-w3c-validator-status'])
+        print("Unknown status: %s" % resp.headers['x-w3c-validator-status'])
-- 
2.39.5