From: Magnus Hagander
Date: Sat, 19 Jan 2019 19:23:16 +0000 (+0100)
Subject: Remove indexer for old archives
X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=e0fcc082ee4482801b213fba8d420def776a54cf;p=pgweb.git

Remove indexer for old archives

We have been using the new archives for many years now, so remove this
instead of trying to port it to python3.
---
diff --git a/tools/search/crawler/lib/archives.py b/tools/search/crawler/lib/archives.py
deleted file mode 100644
index 2235dbb3..00000000
--- a/tools/search/crawler/lib/archives.py
+++ /dev/null
@@ -1,168 +0,0 @@
-import datetime
-import httplib
-from Queue import Queue
-import threading
-import sys
-import time
-
-from lib.log import log
-from lib.parsers import ArchivesParser
-
-
-class MultiListCrawler(object):
-    def __init__(self, lists, conn, status_interval=30, commit_interval=500):
-        self.lists = lists
-        self.conn = conn
-        self.status_interval = status_interval
-        self.commit_interval = commit_interval
-
-        self.queue = Queue()
-        self.counter = 0
-        self.counterlock = threading.RLock()
-        self.stopevent = threading.Event()
-
-    def crawl(self, full=False, month=None):
-        # Each thread can independently run on one month, so we can get
-        # a reasonable spread. Therefor, submit them as separate jobs
-        # to the queue.
-        for listid, listname in self.lists:
-            if full:
-                # Generate a sequence of everything to index
-                for year in range(1997, datetime.datetime.now().year + 1):
-                    for month in range(1, 13):
-                        self.queue.put((listid, listname, year, month, -1))
-            elif month:
-                # Do one specific month
-                pieces = month.split("-")
-                if len(pieces) != 2:
-                    print("Month format is -, cannot parse '%s'" % month)
-                    sys.exit(1)
-                try:
-                    pieces = [int(x) for x in pieces]
-                except:
-                    print("Month format is -, cannot convert '%s' to integers" % month)
-                    sys.exit(1)
-                self.queue.put((listid, listname, pieces[0], pieces[1], -1))
-            else:
-                # In incremental scan, we check the current month and the
-                # previous one, but only for new messages.
-                curs = self.conn.cursor()
-                curr = datetime.date.today()
-                if curr.month == 1:
-                    prev = datetime.date(curr.year - 1, 12, 1)
-                else:
-                    prev = datetime.date(curr.year, curr.month - 1, 1)
-
-                for d in curr, prev:
-                    # Figure out what the highest indexed page in this
-                    # month is.
-                    curs.execute("SELECT max(msgnum) FROM messages WHERE list=%(list)s AND year=%(year)s AND month=%(month)s", {
-                        'list': listid,
-                        'year': d.year,
-                        'month': d.month,
-                    })
-                    x = curs.fetchall()
-                    if x[0][0] is not None:
-                        maxmsg = x[0][0]
-                    else:
-                        maxmsg = -1
-                    self.queue.put((listid, listname, d.year, d.month, maxmsg))
-
-        for x in range(5):
-            t = threading.Thread(name="Indexer %s" % x,
-                                 target=lambda: self.crawl_from_queue())
-            t.daemon = True
-            t.start()
-
-        t = threading.Thread(name="statusthread", target=lambda: self.status_thread())
-        t.daemon = True
-        t.start()
-
-        # XXX: need to find a way to deal with all threads crashed and
-        # not done here yet!
-        self.queue.join()
-        self.stopevent.set()
-
-        return self.counter
-
-    def status_thread(self):
-        lastcommit = 0
-        starttime = time.time()
-        while not self.stopevent.is_set():
-            self.stopevent.wait(self.status_interval)
-            nowtime = time.time()
-            with self.counterlock:
-                log("Indexed %s messages so far (%s active threads, %s months still queued, %.1f msg/sec)" % (
-                    self.counter,
-                    threading.active_count() - 2,  # main thread + status thread
-                    self.queue.qsize(),
-                    self.counter / (nowtime - starttime),
-                ))
-                # Commit every 500 messages
-                if self.counter - lastcommit > self.commit_interval:
-                    lastcommit = self.counter
-                    self.conn.commit()
-
-    def crawl_from_queue(self):
-        while not self.stopevent.is_set():
-            (listid, listname, year, month, maxmsg) = self.queue.get()
-            self.crawl_month(listid, listname, year, month, maxmsg)
-            self.queue.task_done()
-
-    def crawl_month(self, listid, listname, year, month, maxmsg):
-        currentmsg = maxmsg
-        while True:
-            currentmsg += 1
-            try:
-                if not self.crawl_single_message(listid, listname, year, month, currentmsg):
-                    break
-            except Exception as e:
-                log("Exception when crawling %s/%s/%s/%s - %s" % (
-                    listname, year, month, currentmsg, e))
-                # Continue on to try the next message
-
-    def crawl_single_message(self, listid, listname, year, month, msgnum):
-        curs = self.conn.cursor()
-        h = httplib.HTTPConnection(host="archives.postgresql.org",
-                                   port=80,
-                                   strict=True,
-                                   timeout=10)
-        url = "/%s/%04d-%02d/msg%05d.php" % (
-            listname,
-            year,
-            month,
-            msgnum)
-        h.putrequest("GET", url)
-        h.putheader("User-agent", "pgsearch/0.2")
-        h.putheader("Connection", "close")
-        h.endheaders()
-        resp = h.getresponse()
-        txt = resp.read()
-        h.close()
-
-        if resp.status == 404:
-            # Past the end of the month
-            return False
-        elif resp.status != 200:
-            raise Exception("%s/%s/%s/%s returned status %s" % (listname, year, month, msgnum, resp.status))
-
-        # Else we have the message!
-        p = ArchivesParser()
-        if not p.parse(txt):
-            log("Failed to parse %s/%s/%s/%s" % (listname, year, month, msgnum))
-            # We return true to move on to the next message anyway!
-            return True
-        curs.execute("INSERT INTO messages (list, year, month, msgnum, date, subject, author, txt, fti) VALUES (%(listid)s, %(year)s, %(month)s, %(msgnum)s, %(date)s, %(subject)s, %(author)s, %(txt)s, setweight(to_tsvector('pg', %(subject)s), 'A') || to_tsvector('pg', %(txt)s))", {
-            'listid': listid,
-            'year': year,
-            'month': month,
-            'msgnum': msgnum,
-            'date': p.date,
-            'subject': p.subject[:127],
-            'author': p.author[:127],
-            'txt': p.body,
-        })
-        with self.counterlock:
-            self.counter += 1
-
-        return True
diff --git a/tools/search/crawler/lib/parsers.py b/tools/search/crawler/lib/parsers.py
index 369350ea..4315548d 100644
--- a/tools/search/crawler/lib/parsers.py
+++ b/tools/search/crawler/lib/parsers.py
@@ -58,85 +58,6 @@ class GenericHtmlParser(HTMLParser):
         return self.pagedata.read()
 
 
-class ArchivesParser(object):
-    rematcher = re.compile(".*.*.*(.*)", re.DOTALL)
-    hp = HTMLParser()
-
-    def __init__(self):
-        self.subject = None
-        self.author = None
-        self.date = None
-        self.body = None
-
-    def parse(self, contents):
-        contents = lossy_unicode(contents)
-        match = self.rematcher.search(contents)
-        if not match:
-            return False
-        self.subject = self.hp.unescape(match.group(1))
-        self.author = self.almost_rot13(self.hp.unescape(match.group(2)))
-        if not self.parse_date(self.hp.unescape(match.group(3))):
-            return False
-        self.body = self.hp.unescape(match.group(4))
-        return True
-
-    _date_multi_re = re.compile(' \((\w+\s\w+|)\)$')
-    _date_trailing_envelope = re.compile('\s+\(envelope.*\)$')
-
-    def parse_date(self, d):
-        # For some reason, we have dates that look like this:
-        # http://archives.postgresql.org/pgsql-bugs/1999-05/msg00018.php
-        # Looks like an mhonarc bug, but let's just remove that trailing
-        # stuff here to be sure...
-        if self._date_trailing_envelope.search(d):
-            d = self._date_trailing_envelope.sub('', d)
-
-        # We have a number of dates in the format
-        # " +0200 (MET DST)"
-        # or similar. The problem coming from the space within the
-        # parenthesis, or if the contents of the parenthesis is
-        # completely empty
-        if self._date_multi_re.search(d):
-            d = self._date_multi_re.sub('', d)
-        # Isn't it wonderful with a string with a trailing quote but no
-        # leading quote? MUA's are weird...
-        if d.endswith('"') and not d.startswith('"'):
-            d = d[:-1]
-
-        # We also have "known incorrect timezone specs".
-        if d.endswith('MST7MDT'):
-            d = d[:-4]
-        elif d.endswith('METDST'):
-            d = d[:-3]
-        elif d.endswith('"MET'):
-            d = d[:-4] + "MET"
-
-        try:
-            self.date = dateutil.parser.parse(d)
-        except ValueError:
-            log("Failed to parse date '%s'" % d)
-            return False
-
-        if self.date.utcoffset():
-            # We have some messages with completely incorrect utc offsets,
-            # so we need to reject those too
-            if self.date.utcoffset() > timedelta(hours=12) or self.date.utcoffset() < timedelta(hours=-12):
-                log("Failed to parse date %s', timezone offset out of range." % d)
-                return False
-
-        return True
-
-    # Semi-hacked rot13, because the one used by mhonarc is broken.
-    # So we copy the brokenness here.
-    # This code is from MHonArc/ewhutil.pl, mrot13()
-    _arot13_trans = dict(list(zip(list(map(ord,
-                                           '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[abcdefghijklmnopqrstuvwxyz')),
-                                  'NOPQRSTUVWXYZ[@ABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm')))
-
-    def almost_rot13(self, s):
-        return str(s).translate(self._arot13_trans)
-
-
 class RobotsParser(object):
     def __init__(self, url):
         try:
diff --git a/tools/search/crawler/listcrawler.py b/tools/search/crawler/listcrawler.py
deleted file mode 100755
index 1a416079..00000000
--- a/tools/search/crawler/listcrawler.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-from lib.log import log
-from lib.archives import MultiListCrawler
-from lib.threadwrapper import threadwrapper
-from ConfigParser import ConfigParser
-from optparse import OptionParser
-import psycopg2
-import sys
-import time
-
-
-def doit(opt):
-    cp = ConfigParser()
-    cp.read("search.ini")
-    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
-    conn = psycopg2.connect(cp.get("search", "db"))
-
-    curs = conn.cursor()
-
-    if opt.list:
-        # Multiple lists can be specified with a comma separator (no spaces)
-        curs.execute("SELECT id,name FROM lists WHERE name=ANY(%(names)s)", {
-            'names': opt.list.split(','),
-        })
-    else:
-        curs.execute("SELECT id,name FROM lists WHERE active ORDER BY id")
-
-    listinfo = [(id, name) for id, name in curs.fetchall()]
-    c = MultiListCrawler(listinfo, conn, opt.status_interval, opt.commit_interval)
-    n = c.crawl(opt.full, opt.month)
-
-    # Update total counts
-    curs.execute("WITH t AS (SELECT list,count(*) AS c FROM messages GROUP BY list) UPDATE lists SET pagecount=t.c FROM t WHERE id=t.list")
-    # Indicate when we crawled
-    curs.execute("UPDATE lastcrawl SET lastcrawl=CURRENT_TIMESTAMP")
-    conn.commit()
-
-    log("Indexed %s messages" % n)
-    time.sleep(1)
-
-
-if __name__ == "__main__":
-    parser = OptionParser()
-    parser.add_option("-l", "--list", dest='list', help="Crawl only this list")
-    parser.add_option("-m", "--month", dest='month', help="Crawl only this month")
-    parser.add_option("-f", "--full", dest='full', action="store_true", help="Make a full crawl")
-    parser.add_option("-t", "--status-interval", dest='status_interval', help="Seconds between status updates")
-    parser.add_option("-c", "--commit-interval", dest='commit_interval', help="Messages between each commit")
-
-    (opt, args) = parser.parse_args()
-
-    if opt.full and opt.month:
-        print("Can't use both full and specific month!")
-        sys.exit(1)
-
-    # assign default values
-    opt.status_interval = opt.status_interval and int(opt.status_interval) or 30
-    opt.commit_interval = opt.commit_interval and int(opt.commit_interval) or 500
-
-    threadwrapper(doit, opt)
diff --git a/tools/search/crawler/listsync.py b/tools/search/crawler/listsync.py
deleted file mode 100755
index 10da49e4..00000000
--- a/tools/search/crawler/listsync.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-from lib.log import log
-from ConfigParser import ConfigParser
-import psycopg2
-import urllib
-import simplejson as json
-
-if __name__ == "__main__":
-    cp = ConfigParser()
-    cp.read("search.ini")
-    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
-    conn = psycopg2.connect(cp.get("search", "db"))
-    curs = conn.cursor()
-
-    u = urllib.urlopen("http://%s/community/lists/listinfo/" % cp.get("search", "web"))
-    obj = json.load(u)
-    u.close()
-
-    # We don't care about the groups here, just the lists!
- curs.execute("SELECT id, name, grp, active FROM lists") - lists = curs.fetchall() - for id, name, groupid, active in lists: - thislist = [x for x in obj['lists'] if x['id'] == id] - if len(thislist) == 0: - log("List %s should be removed, do that manually!" % name) - else: - # Compare contents of list - l = thislist[0] - if l['name'] != name: - log("Renaming list %s -> %s" % (name, l['name'])) - curs.execute("UPDATE lists SET name=%(name)s WHERE id=%(id)s", l) - - if thislist[0]['active'] != active: - log("Changing active flag for %s to %s" % (l['name'], l['active'])) - curs.execute("UPDATE lists SET active=%(active)s WHERE id=%(id)s", l) - if thislist[0]['groupid'] != groupid: - log("Changing group for %s to %s" % (l['name'], l['groupid'])) - curs.execute("UPDATE lists SET grp=%(groupid)s WHERE id=%(id)s", l) - - for l in obj['lists']: - thislist = [x for x in lists if x[0] == l['id']] - if len(thislist) == 0: - log("Adding list %s" % l['name']) - curs.execute("INSERT INTO lists (id, name, grp, active, pagecount) VALUES (%(id)s, %(name)s, %(groupid)s, %(active)s, 0)", - l) - - conn.commit()