Update search crawler for python3
authorMagnus Hagander <magnus@hagander.net>
Wed, 23 Jan 2019 21:14:56 +0000 (22:14 +0100)
committerMagnus Hagander <magnus@hagander.net>
Sat, 26 Jan 2019 15:19:26 +0000 (16:19 +0100)
This includes the switch to the requests library, along with a number of other changes.

While at it, fix it so we can do proper https validation.

tools/search/crawler/lib/basecrawler.py
tools/search/crawler/lib/parsers.py
tools/search/crawler/lib/sitemapsite.py

index c7579cd39e94397f0cb8ffcdb2306842a2781850..e97b4c7557ddd02b654bd1e19dada4c3a866ebfc 100644 (file)
@@ -1,15 +1,28 @@
 import datetime
-import httplib
 import time
 from email.utils import formatdate, parsedate
-import urlparse
-import ssl
+import urllib.parse
+import requests
+import urllib3
 
-from Queue import Queue
+from queue import Queue
 import threading
 
 from lib.log import log
-from lib.parsers import GenericHtmlParser, lossy_unicode
+from lib.parsers import GenericHtmlParser
+
+
+_orig_create_connection = urllib3.util.connection.create_connection
+
+
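+# requests (via urllib3) has no simple way to connect to a specific IP
+# address while still using the original hostname for the Host header and
+# https certificate validation, so monkeypatch urllib3's create_connection
+# to redirect connections for the given hostname to the given IP address.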
+def override_create_connection(hostname, ipaddr):
+    def _override(address, *args, **kwargs):
+        host, port = address
+        if host == hostname:
+            return _orig_create_connection((ipaddr, port), *args, **kwargs)
+        else:
+            return _orig_create_connection(address, *args, **kwargs)
+    urllib3.util.connection.create_connection = _override
 
 
 class BaseSiteCrawler(object):
@@ -25,6 +38,9 @@ class BaseSiteCrawler(object):
         self.pages_deleted = 0
         self.status_interval = 5
 
+        # If a specific server IP is configured, install the connection
+        # override so requests for this hostname are routed to that IP.
+        if serverip:
+            override_create_connection(hostname, serverip)
+
         curs = dbconn.cursor()
         curs.execute("SELECT suburl, lastscanned FROM webpages WHERE site=%(id)s AND lastscanned IS NOT NULL", {'id': siteid})
         self.scantimes = dict(curs.fetchall())
@@ -124,7 +140,6 @@ class BaseSiteCrawler(object):
             return
 
         # Try to convert pagedata to a unicode string
-        pagedata = lossy_unicode(pagedata)
         try:
             self.page = self.parse_html(pagedata)
         except Exception as e:
@@ -167,46 +182,42 @@ class BaseSiteCrawler(object):
 
     def fetch_page(self, url):
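+        # Returns a tuple (result, data, lastmod): 0 means the page was
+        # fetched (or was unmodified), 1 means a redirect with data holding
+        # the new URL, and 2 means the page should be skipped.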
         try:
-            # Unfortunatley, persistent connections seem quite unreliable,
-            # so create a new one for each page.
-            if self.serverip:
-                if not self.https:
-                    h = httplib.HTTPConnection(host=self.serverip, port=80, strict=True, timeout=10)
-                else:
-                    h = httplib.HTTPSConnection(host=self.serverip, port=443, strict=True, timeout=10, context=ssl._create_unverified_context())
-                h.putrequest("GET", url, skip_host=1)
-                h.putheader("Host", self.hostname)
-            else:
-                if not self.https:
-                    h = httplib.HTTPConnection(host=self.hostname, port=80, strict=True, timeout=10)
-                else:
-                    h = httplib.HTTPSConnection(host=self.hostname, port=443, strict=True, timeout=10, context=ssl._create_unverified_context())
-                h.putrequest("GET", url)
-            h.putheader("User-agent", "pgsearch/0.2")
-            h.putheader("Connection", "close")
+            headers = {
+                'User-agent': 'pgsearch/0.2',
+            }
             if url in self.scantimes:
-                h.putheader("If-Modified-Since", formatdate(time.mktime(self.scantimes[url].timetuple())))
-            h.endheaders()
-            resp = h.getresponse()
+                headers["If-Modified-Since"] = formatdate(time.mktime(self.scantimes[url].timetuple()))
+
+            # When a serverip is configured, the urllib3 connection override
+            # installed in __init__ redirects the connection there, so we
+            # always request by hostname and https certificate validation
+            # keeps working.
+            resp = requests.get(
+                '{}://{}{}'.format('https' if self.https else 'http', self.hostname, url),
+                headers=headers,
+                timeout=10,
+            )
 
-            if resp.status == 200:
-                if not self.accept_contenttype(resp.getheader("content-type")):
+            if resp.status_code == 200:
+                if not self.accept_contenttype(resp.headers.get("content-type", "")):
                     # Content-type we're not interested in
                     return (2, None, None)
-                return (0, resp.read(), self.get_date(resp.getheader("last-modified")))
-            elif resp.status == 304:
+                return (0, resp.text, self.get_date(resp.headers.get("last-modified", None)))
+            elif resp.status_code == 304:
                 # Not modified, so no need to reprocess, but also don't
                 # give an error message for it...
                 return (0, None, None)
-            elif resp.status == 301:
+            elif resp.status_code == 301:
                 # A redirect... So try again with the redirected-to URL
                 # We send this through our link resolver to deal with both
                 # absolute and relative URLs
-                if resp.getheader('location', '') == '':
+                if resp.headers.get('location', '') == '':
                     log("Url %s returned empty redirect" % url)
                     return (2, None, None)
 
-                for tgt in self.resolve_links([resp.getheader('location', '')], url):
+                for tgt in self.resolve_links([resp.headers['location']], url):
                     return (1, tgt, None)
                 # No redirect at all found, because it was invalid?
                 return (2, None, None)
@@ -233,7 +244,7 @@ class BaseSiteCrawler(object):
 
     def resolve_links(self, links, pageurl):
         for x in links:
-            p = urlparse.urlsplit(x)
+            p = urllib.parse.urlsplit(x)
             if p.scheme in ("http", "https"):
                 if p.netloc != self.hostname:
                     # Remote link
@@ -252,10 +263,10 @@ class BaseSiteCrawler(object):
 
                 if p[2][0] == "/":
                     # Absolute link on this host, so just return it
-                    yield urlparse.urlunsplit(p)
+                    yield urllib.parse.urlunsplit(p)
                 else:
                     # Relative link
-                    yield urlparse.urljoin(pageurl, urlparse.urlunsplit(p))
+                    yield urllib.parse.urljoin(pageurl, urllib.parse.urlunsplit(p))
             else:
                 # Ignore unknown url schemes like mailto
                 pass
index c19bf932cd04834a222f5292176730440f0e7698..8ea56eaf238627cd5c2c236f1e66534ba55ca50b 100644 (file)
@@ -1,5 +1,5 @@
 import re
-import urllib
+import requests
 from io import StringIO
 import dateutil.parser
 from datetime import timedelta
@@ -61,12 +61,10 @@ class GenericHtmlParser(HTMLParser):
 class RobotsParser(object):
     def __init__(self, url):
         try:
-            u = urllib.urlopen(url)
-            txt = u.read()
-            u.close()
+            r = requests.get(url)
             self.disallows = []
             activeagent = False
-            for l in txt.splitlines():
+            for l in r.text.splitlines():
                 if l.lower().startswith("user-agent: ") and len(l) > 12:
                     if l[12] == "*" or l[12:20] == "pgsearch":
                         activeagent = True
@@ -83,15 +81,3 @@ class RobotsParser(object):
             if url.startswith(d):
                 return True
         return False
-
-
-# Convert a string to unicode, try utf8 first, then latin1, then give
-# up and do a best-effort utf8.
-def lossy_unicode(s):
-    try:
-        return str(s, 'utf8')
-    except UnicodeDecodeError:
-        try:
-            return str(s, 'latin1')
-        except UnicodeDecodeError:
-            return str(s, 'utf8', 'replace')
index 4e98cfd18446999649cf7045823296ccb79d5b32..b4574457b028dafb2cf15ef0b603b03e98261c7c 100644 (file)
@@ -1,6 +1,6 @@
-import urllib
 import xml.parsers.expat
 import dateutil.parser
+import requests
 
 from lib.log import log
 from lib.basecrawler import BaseSiteCrawler
@@ -10,7 +10,7 @@ class SitemapParser(object):
     def __init__(self):
         self.urls = []
 
-    def parse(self, f, internal=False):
+    def parse(self, data, internal=False):
         self.parser = xml.parsers.expat.ParserCreate()
         self.currenturl = ""
         self.currentprio = 0
@@ -25,7 +25,7 @@ class SitemapParser(object):
         self.parser.CharacterDataHandler = lambda data: self.processcharacterdata(data)
         self.internal = internal
 
-        self.parser.ParseFile(f)
+        self.parser.Parse(data, True)
 
     def processelement(self, name, attrs):
         if name == "url":
@@ -67,19 +67,20 @@ class SitemapSiteCrawler(BaseSiteCrawler):
     def init_crawl(self):
         # Fetch the sitemap. We ignore robots.txt in this case, and
         # assume it's always under /sitemap.xml
-        u = urllib.urlopen("https://%s/sitemap.xml" % self.hostname)
+        r = requests.get("https://%s/sitemap.xml" % self.hostname)
+        if r.status_code != 200:
+            raise Exception("Could not load sitemap: %s" % r.status_code)
+
         p = SitemapParser()
-        p.parse(u)
-        u.close()
+        p.parse(r.text)
 
         # Attempt to fetch a sitemap_internal.xml. This is used to index
         # pages on our internal search engine that we don't want on
         # Google. They should also be excluded from default search
         # results (unless searching with a specific suburl)
-        u = urllib.urlopen("https://%s/sitemap_internal.xml" % self.hostname)
-        if u.getcode() == 200:
-            p.parse(u, True)
-        u.close()
+        r = requests.get("https://%s/sitemap_internal.xml" % self.hostname)
+        if r.status_code == 200:
+            p.parse(r.text, True)
 
         for url, prio, lastmod, internal in p.urls:
             # Advance 8 characters - length of https://.