Update search crawler for python3
authorMagnus Hagander <magnus@hagander.net>
Wed, 23 Jan 2019 21:14:56 +0000 (22:14 +0100)
committerMagnus Hagander <magnus@hagander.net>
Sat, 26 Jan 2019 15:19:26 +0000 (16:19 +0100)
This includes the switch to the requests library, along with a number of other changes.

While at it, fix it so we can do proper https validation.

tools/search/crawler/lib/basecrawler.py
tools/search/crawler/lib/parsers.py
tools/search/crawler/lib/sitemapsite.py

index c7579cd39e94397f0cb8ffcdb2306842a2781850..e97b4c7557ddd02b654bd1e19dada4c3a866ebfc 100644 (file)
@@ -1,15 +1,28 @@
 import datetime
-import httplib
 import time
 from email.utils import formatdate, parsedate
-import urlparse
-import ssl
+import urllib.parse
+import requests
+import urllib3
 
-from Queue import Queue
+from queue import Queue
 import threading
 
 from lib.log import log
-from lib.parsers import GenericHtmlParser, lossy_unicode
+from lib.parsers import GenericHtmlParser
+
+
+_orig_create_connection = urllib3.util.connection.create_connection
+
+
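+# requests (via urllib3) has no simple way to connect to a specific IP
+# address while still using the original hostname for the Host header and
+# https certificate validation, so monkeypatch urllib3's create_connection
+# to redirect connections for the given hostname to the given IP address.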
+def override_create_connection(hostname, ipaddr):
+    def _override(address, *args, **kwargs):
+        host, port = address
+        if host == hostname:
+            return _orig_create_connection((ipaddr, port), *args, **kwargs)
+        else:
+            return _orig_create_connection(address, *args, **kwargs)
+    urllib3.util.connection.create_connection = _override
 
 
 class BaseSiteCrawler(object):
@@ -25,6 +38,9 @@ class BaseSiteCrawler(object):
         self.pages_deleted = 0
         self.status_interval = 5
 
+        # If a specific server IP is configured, install the connection
+        # override so requests for this hostname are routed to that IP.
+        if serverip:
+            override_create_connection(hostname, serverip)
+
         curs = dbconn.cursor()
         curs.execute("SELECT suburl, lastscanned FROM webpages WHERE site=%(id)s AND lastscanned IS NOT NULL", {'id': siteid})
         self.scantimes = dict(curs.fetchall())
@@ -124,7 +140,6 @@ class BaseSiteCrawler(object):
             return
 
         # Try to convert pagedata to a unicode string
-        pagedata = lossy_unicode(pagedata)
         try:
             self.page = self.parse_html(pagedata)
         except Exception as e:
@@ -167,46 +182,42 @@ class BaseSiteCrawler(object):
 
     def fetch_page(self, url):
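+        # Returns a tuple (result, data, lastmod): 0 means the page was
+        # fetched (or was unmodified), 1 means a redirect with data holding
+        # the new URL, and 2 means the page should be skipped.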
         try:
-            # Unfortunatley, persistent connections seem quite unreliable,
-            # so create a new one for each page.
-            if self.serverip:
-                if not self.https:
-                    h = httplib.HTTPConnection(host=self.serverip, port=80, strict=True, timeout=10)
-                else:
-                    h = httplib.HTTPSConnection(host=self.serverip, port=443, strict=True, timeout=10, context=ssl._create_unverified_context())
-                h.putrequest("GET", url, skip_host=1)
-                h.putheader("Host", self.hostname)
-            else:
-                if not self.https:
-                    h = httplib.HTTPConnection(host=self.hostname, port=80, strict=True, timeout=10)
-                else:
-                    h = httplib.HTTPSConnection(host=self.hostname, port=443, strict=True, timeout=10, context=ssl._create_unverified_context())
-                h.putrequest("GET", url)
-            h.putheader("User-agent", "pgsearch/0.2")
-            h.putheader("Connection", "close")
+            headers = {
+                'User-agent': 'pgsearch/0.2',
+            }
             if url in self.scantimes:
-                h.putheader("If-Modified-Since", formatdate(time.mktime(self.scantimes[url].timetuple())))
-            h.endheaders()
-            resp = h.getresponse()
+                headers["If-Modified-Since"] = formatdate(time.mktime(self.scantimes[url].timetuple()))
+
+            # When a serverip is configured, the urllib3 connection override
+            # installed in __init__ redirects the connection there, so we
+            # always request by hostname and https certificate validation
+            # keeps working.
+            resp = requests.get(
+                '{}://{}{}'.format('https' if self.https else 'http', self.hostname, url),
+                headers=headers,
+                timeout=10,
+            )
 
-            if resp.status == 200:
-                if not self.accept_contenttype(resp.getheader("content-type")):
+            if resp.status_code == 200:
+                if not self.accept_contenttype(resp.headers.get("content-type", "")):
                     # Content-type we're not interested in
                     return (2, None, None)
-                return (0, resp.read(), self.get_date(resp.getheader("last-modified")))
-            elif resp.status == 304:
+                return (0, resp.text, self.get_date(resp.headers.get("last-modified", None)))
+            elif resp.status_code == 304:
                 # Not modified, so no need to reprocess, but also don't
                 # give an error message for it...
                 return (0, None, None)
-            elif resp.status == 301:
+            elif resp.status_code == 301:
                 # A redirect... So try again with the redirected-to URL
                 # We send this through our link resolver to deal with both
                 # absolute and relative URLs
-                if resp.getheader('location', '') == '':
+                if resp.headers.get('location', '') == '':
                     log("Url %s returned empty redirect" % url)
                     return (2, None, None)
 
-                for tgt in self.resolve_links([resp.getheader('location', '')], url):
+                for tgt in self.resolve_links([resp.headers['location']], url):
                     return (1, tgt, None)
                 # No redirect at all found, because it was invalid?
                 return (2, None, None)
@@ -233,7 +244,7 @@ class BaseSiteCrawler(object):
 
     def resolve_links(self, links, pageurl):
         for x in links:
-            p = urlparse.urlsplit(x)
+            p = urllib.parse.urlsplit(x)
             if p.scheme in ("http", "https"):
                 if p.netloc != self.hostname:
                     # Remote link
@@ -252,10 +263,10 @@ class BaseSiteCrawler(object):
 
                 if p[2][0] == "/":
                     # Absolute link on this host, so just return it
-                    yield urlparse.urlunsplit(p)
+                    yield urllib.parse.urlunsplit(p)
                 else:
                     # Relative link
-                    yield urlparse.urljoin(pageurl, urlparse.urlunsplit(p))
+                    yield urllib.parse.urljoin(pageurl, urllib.parse.urlunsplit(p))
             else:
                 # Ignore unknown url schemes like mailto
                 pass
index c19bf932cd04834a222f5292176730440f0e7698..8ea56eaf238627cd5c2c236f1e66534ba55ca50b 100644 (file)
@@ -1,5 +1,5 @@
 import re
-import urllib
+import requests
 from io import StringIO
 import dateutil.parser
 from datetime import timedelta
@@ -61,12 +61,10 @@ class GenericHtmlParser(HTMLParser):
 class RobotsParser(object):
     def __init__(self, url):
         try:
-            u = urllib.urlopen(url)
-            txt = u.read()
-            u.close()
+            r = requests.get(url)
             self.disallows = []
             activeagent = False
-            for l in txt.splitlines():
+            for l in r.text.splitlines():
                 if l.lower().startswith("user-agent: ") and len(l) > 12:
                     if l[12] == "*" or l[12:20] == "pgsearch":
                         activeagent = True
@@ -83,15 +81,3 @@ class RobotsParser(object):
             if url.startswith(d):
                 return True
         return False
-
-
-# Convert a string to unicode, try utf8 first, then latin1, then give
-# up and do a best-effort utf8.
-def lossy_unicode(s):
-    try:
-        return str(s, 'utf8')
-    except UnicodeDecodeError:
-        try:
-            return str(s, 'latin1')
-        except UnicodeDecodeError:
-            return str(s, 'utf8', 'replace')
index 4e98cfd18446999649cf7045823296ccb79d5b32..b4574457b028dafb2cf15ef0b603b03e98261c7c 100644 (file)
@@ -1,6 +1,6 @@
-import urllib
 import xml.parsers.expat
 import dateutil.parser
+import requests
 
 from lib.log import log
 from lib.basecrawler import BaseSiteCrawler
@@ -10,7 +10,7 @@ class SitemapParser(object):
     def __init__(self):
         self.urls = []
 
-    def parse(self, f, internal=False):
+    def parse(self, data, internal=False):
         self.parser = xml.parsers.expat.ParserCreate()
         self.currenturl = ""
         self.currentprio = 0
@@ -25,7 +25,7 @@ class SitemapParser(object):
         self.parser.CharacterDataHandler = lambda data: self.processcharacterdata(data)
         self.internal = internal
 
-        self.parser.ParseFile(f)
+        self.parser.Parse(data, True)
 
     def processelement(self, name, attrs):
         if name == "url":
@@ -67,19 +67,20 @@ class SitemapSiteCrawler(BaseSiteCrawler):
     def init_crawl(self):
         # Fetch the sitemap. We ignore robots.txt in this case, and
         # assume it's always under /sitemap.xml
-        u = urllib.urlopen("https://%s/sitemap.xml" % self.hostname)
+        r = requests.get("https://%s/sitemap.xml" % self.hostname)
+        if r.status_code != 200:
+            raise Exception("Could not load sitemap: %s" % r.status_code)
+
         p = SitemapParser()
-        p.parse(u)
-        u.close()
+        p.parse(r.text)
 
         # Attempt to fetch a sitemap_internal.xml. This is used to index
         # pages on our internal search engine that we don't want on
         # Google. They should also be excluded from default search
         # results (unless searching with a specific suburl)
-        u = urllib.urlopen("https://%s/sitemap_internal.xml" % self.hostname)
-        if u.getcode() == 200:
-            p.parse(u, True)
-        u.close()
+        r = requests.get("https://%s/sitemap_internal.xml" % self.hostname)
+        if r.status_code == 200:
+            p.parse(r.text, True)
 
         for url, prio, lastmod, internal in p.urls:
             # Advance 8 characters - length of https://.