+++ /dev/null
-import datetime
-import httplib
-from Queue import Queue
-import threading
-import sys
-import time
-
-from lib.log import log
-from lib.parsers import ArchivesParser
-
-
-class MultiListCrawler(object):
- def __init__(self, lists, conn, status_interval=30, commit_interval=500):
- self.lists = lists
- self.conn = conn
- self.status_interval = status_interval
- self.commit_interval = commit_interval
-
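- # The work queue holds (listid, listname, year, month, maxmsg) tuples;
- # maxmsg is -1 when a month should be crawled from the very first message.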
- self.queue = Queue()
- self.counter = 0
- self.counterlock = threading.RLock()
- self.stopevent = threading.Event()
-
- def crawl(self, full=False, month=None):
- # Each thread can independently run on one month, so we can get
- # a reasonable spread. Therefore, submit them as separate jobs
- # to the queue.
- for listid, listname in self.lists:
- if full:
- # Generate a sequence of everything to index
- for year in range(1997, datetime.datetime.now().year + 1):
- for month in range(1, 13):
- self.queue.put((listid, listname, year, month, -1))
- elif month:
- # Do one specific month
- pieces = month.split("-")
- if len(pieces) != 2:
- print("Month format is <y>-<m>, cannot parse '%s'" % month)
- sys.exit(1)
- try:
- pieces = [int(x) for x in pieces]
- except ValueError:
- print("Month format is <y>-<m>, cannot convert '%s' to integers" % month)
- sys.exit(1)
- self.queue.put((listid, listname, pieces[0], pieces[1], -1))
- else:
- # In an incremental scan, we check the current month and the
- # previous one, but only for new messages.
- curs = self.conn.cursor()
- curr = datetime.date.today()
- if curr.month == 1:
- prev = datetime.date(curr.year - 1, 12, 1)
- else:
- prev = datetime.date(curr.year, curr.month - 1, 1)
-
- for d in curr, prev:
- # Figure out what the highest indexed page in this
- # month is.
- curs.execute("SELECT max(msgnum) FROM messages WHERE list=%(list)s AND year=%(year)s AND month=%(month)s", {
- 'list': listid,
- 'year': d.year,
- 'month': d.month,
- })
- x = curs.fetchall()
- if x[0][0] is not None:
- maxmsg = x[0][0]
- else:
- maxmsg = -1
- self.queue.put((listid, listname, d.year, d.month, maxmsg))
-
- for x in range(5):
- t = threading.Thread(name="Indexer %s" % x,
- target=lambda: self.crawl_from_queue())
- t.daemon = True
- t.start()
-
- t = threading.Thread(name="statusthread", target=lambda: self.status_thread())
- t.daemon = True
- t.start()
-
- # XXX: need a way to detect the case where all worker threads have
- # crashed while the queue is not yet empty!
- self.queue.join()
- self.stopevent.set()
-
- return self.counter
-
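- # Periodically log crawl progress and commit the shared connection
- # once at least commit_interval messages have been indexed.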
- def status_thread(self):
- lastcommit = 0
- starttime = time.time()
- while not self.stopevent.is_set():
- self.stopevent.wait(self.status_interval)
- nowtime = time.time()
- with self.counterlock:
- log("Indexed %s messages so far (%s active threads, %s months still queued, %.1f msg/sec)" % (
- self.counter,
- threading.active_count() - 2, # main thread + status thread
- self.queue.qsize(),
- self.counter / (nowtime - starttime),
- ))
- # Commit once commit_interval messages have been indexed since the last commit
- if self.counter - lastcommit > self.commit_interval:
- lastcommit = self.counter
- self.conn.commit()
-
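- # Worker thread loop: pull one (list, month) job at a time off the
- # queue and crawl it until the stop event is set.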
- def crawl_from_queue(self):
- while not self.stopevent.is_set():
- (listid, listname, year, month, maxmsg) = self.queue.get()
- self.crawl_month(listid, listname, year, month, maxmsg)
- self.queue.task_done()
-
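- # Walk the messages of a single month sequentially, starting just
- # past maxmsg, until we run off the end of the archive (404).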
- def crawl_month(self, listid, listname, year, month, maxmsg):
- currentmsg = maxmsg
- while True:
- currentmsg += 1
- try:
- if not self.crawl_single_message(listid, listname, year, month, currentmsg):
- break
- except Exception as e:
- log("Exception when crawling %s/%s/%s/%s - %s" % (
- listname, year, month, currentmsg, e))
- # Continue on to try the next message
-
- def crawl_single_message(self, listid, listname, year, month, msgnum):
- curs = self.conn.cursor()
- h = httplib.HTTPConnection(host="archives.postgresql.org",
- port=80,
- strict=True,
- timeout=10)
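- # Archive message URLs have the form /<listname>/<yyyy>-<mm>/msg<nnnnn>.php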
- url = "/%s/%04d-%02d/msg%05d.php" % (
- listname,
- year,
- month,
- msgnum)
- h.putrequest("GET", url)
- h.putheader("User-agent", "pgsearch/0.2")
- h.putheader("Connection", "close")
- h.endheaders()
- resp = h.getresponse()
- txt = resp.read()
- h.close()
-
- if resp.status == 404:
- # Past the end of the month
- return False
- elif resp.status != 200:
- raise Exception("%s/%s/%s/%s returned status %s" % (listname, year, month, msgnum, resp.status))
-
- # Else we have the message!
- p = ArchivesParser()
- if not p.parse(txt):
- log("Failed to parse %s/%s/%s/%s" % (listname, year, month, msgnum))
- # We return True to move on to the next message anyway!
- return True
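- # Store the message along with its full text index: the subject is
- # weighted 'A' and concatenated with the body, both using the 'pg'
- # text search configuration.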
- curs.execute("INSERT INTO messages (list, year, month, msgnum, date, subject, author, txt, fti) VALUES (%(listid)s, %(year)s, %(month)s, %(msgnum)s, %(date)s, %(subject)s, %(author)s, %(txt)s, setweight(to_tsvector('pg', %(subject)s), 'A') || to_tsvector('pg', %(txt)s))", {
- 'listid': listid,
- 'year': year,
- 'month': month,
- 'msgnum': msgnum,
- 'date': p.date,
- 'subject': p.subject[:127],
- 'author': p.author[:127],
- 'txt': p.body,
- })
- with self.counterlock:
- self.counter += 1
-
- return True
return self.pagedata.read()
-class ArchivesParser(object):
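- # mhonarc embeds the subject, the rot13-obfuscated author, the date and
- # the message body as comments in the generated HTML; extract them all
- # with a single regex.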
- rematcher = re.compile("<!--X-Subject: ([^\n]*) -->.*<!--X-From-R13: ([^\n]*) -->.*<!--X-Date: ([^\n]*) -->.*<!--X-Body-of-Message-->(.*)<!--X-Body-of-Message-End-->", re.DOTALL)
- hp = HTMLParser()
-
- def __init__(self):
- self.subject = None
- self.author = None
- self.date = None
- self.body = None
-
- def parse(self, contents):
- contents = lossy_unicode(contents)
- match = self.rematcher.search(contents)
- if not match:
- return False
- self.subject = self.hp.unescape(match.group(1))
- self.author = self.almost_rot13(self.hp.unescape(match.group(2)))
- if not self.parse_date(self.hp.unescape(match.group(3))):
- return False
- self.body = self.hp.unescape(match.group(4))
- return True
-
- _date_multi_re = re.compile(' \((\w+\s\w+|)\)$')
- _date_trailing_envelope = re.compile('\s+\(envelope.*\)$')
-
- def parse_date(self, d):
- # For some reason, we have dates that look like this:
- # http://archives.postgresql.org/pgsql-bugs/1999-05/msg00018.php
- # Looks like an mhonarc bug, but let's just remove that trailing
- # stuff here to be sure...
- if self._date_trailing_envelope.search(d):
- d = self._date_trailing_envelope.sub('', d)
-
- # We have a number of dates in the format
- # "<full datespec> +0200 (MET DST)"
- # or similar. The problem comes from the space within the
- # parentheses, or from the contents of the parentheses being
- # completely empty.
- if self._date_multi_re.search(d):
- d = self._date_multi_re.sub('', d)
- # Isn't it wonderful: a string with a trailing quote but no
- # leading quote? MUAs are weird...
- if d.endswith('"') and not d.startswith('"'):
- d = d[:-1]
-
- # We also have "known incorrect timezone specs".
- if d.endswith('MST7MDT'):
- d = d[:-4]
- elif d.endswith('METDST'):
- d = d[:-3]
- elif d.endswith('"MET'):
- d = d[:-4] + "MET"
-
- try:
- self.date = dateutil.parser.parse(d)
- except ValueError:
- log("Failed to parse date '%s'" % d)
- return False
-
- if self.date.utcoffset():
- # We have some messages with completely incorrect utc offsets,
- # so we need to reject those too
- if self.date.utcoffset() > timedelta(hours=12) or self.date.utcoffset() < timedelta(hours=-12):
- log("Failed to parse date '%s', timezone offset out of range." % d)
- return False
-
- return True
-
- # Semi-hacked rot13, because the one used by mhonarc is broken.
- # So we copy the brokenness here.
- # This code is from MHonArc/ewhutil.pl, mrot13()
- _arot13_trans = dict(list(zip(list(map(ord,
- '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[abcdefghijklmnopqrstuvwxyz')),
- 'NOPQRSTUVWXYZ[@ABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm')))
-
- def almost_rot13(self, s):
- return str(s).translate(self._arot13_trans)
-
-
class RobotsParser(object):
def __init__(self, url):
try:
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from lib.log import log
-from lib.archives import MultiListCrawler
-from lib.threadwrapper import threadwrapper
-from ConfigParser import ConfigParser
-from optparse import OptionParser
-import psycopg2
-import sys
-import time
-
-
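-# Connect to the search database, crawl the selected (or all active) lists,
-# then update the per-list page counts and the last-crawl timestamp.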
-def doit(opt):
- cp = ConfigParser()
- cp.read("search.ini")
- psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
- conn = psycopg2.connect(cp.get("search", "db"))
-
- curs = conn.cursor()
-
- if opt.list:
- # Multiple lists can be specified with a comma separator (no spaces)
- curs.execute("SELECT id,name FROM lists WHERE name=ANY(%(names)s)", {
- 'names': opt.list.split(','),
- })
- else:
- curs.execute("SELECT id,name FROM lists WHERE active ORDER BY id")
-
- listinfo = [(id, name) for id, name in curs.fetchall()]
- c = MultiListCrawler(listinfo, conn, opt.status_interval, opt.commit_interval)
- n = c.crawl(opt.full, opt.month)
-
- # Update total counts
- curs.execute("WITH t AS (SELECT list,count(*) AS c FROM messages GROUP BY list) UPDATE lists SET pagecount=t.c FROM t WHERE id=t.list")
- # Indicate when we crawled
- curs.execute("UPDATE lastcrawl SET lastcrawl=CURRENT_TIMESTAMP")
- conn.commit()
-
- log("Indexed %s messages" % n)
- time.sleep(1)
-
-
-if __name__ == "__main__":
- parser = OptionParser()
- parser.add_option("-l", "--list", dest='list', help="Crawl only this list")
- parser.add_option("-m", "--month", dest='month', help="Crawl only this month")
- parser.add_option("-f", "--full", dest='full', action="store_true", help="Make a full crawl")
- parser.add_option("-t", "--status-interval", dest='status_interval', help="Seconds between status updates")
- parser.add_option("-c", "--commit-interval", dest='commit_interval', help="Messages between each commit")
-
- (opt, args) = parser.parse_args()
-
- if opt.full and opt.month:
- print("Can't use both full and specific month!")
- sys.exit(1)
-
- # assign default values
- opt.status_interval = int(opt.status_interval) if opt.status_interval else 30
- opt.commit_interval = int(opt.commit_interval) if opt.commit_interval else 500
-
- threadwrapper(doit, opt)
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from lib.log import log
-from ConfigParser import ConfigParser
-import psycopg2
-import urllib
-import simplejson as json
-
-if __name__ == "__main__":
- cp = ConfigParser()
- cp.read("search.ini")
- psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
- conn = psycopg2.connect(cp.get("search", "db"))
- curs = conn.cursor()
-
- u = urllib.urlopen("http://%s/community/lists/listinfo/" % cp.get("search", "web"))
- obj = json.load(u)
- u.close()
-
- # We don't care about the groups here, just the lists!
- curs.execute("SELECT id, name, grp, active FROM lists")
- lists = curs.fetchall()
- for id, name, groupid, active in lists:
- thislist = [x for x in obj['lists'] if x['id'] == id]
- if len(thislist) == 0:
- log("List %s should be removed, do that manually!" % name)
- else:
- # Compare contents of list
- l = thislist[0]
- if l['name'] != name:
- log("Renaming list %s -> %s" % (name, l['name']))
- curs.execute("UPDATE lists SET name=%(name)s WHERE id=%(id)s", l)
-
- if l['active'] != active:
- log("Changing active flag for %s to %s" % (l['name'], l['active']))
- curs.execute("UPDATE lists SET active=%(active)s WHERE id=%(id)s", l)
- if l['groupid'] != groupid:
- log("Changing group for %s to %s" % (l['name'], l['groupid']))
- curs.execute("UPDATE lists SET grp=%(groupid)s WHERE id=%(id)s", l)
-
- for l in obj['lists']:
- thislist = [x for x in lists if x[0] == l['id']]
- if len(thislist) == 0:
- log("Adding list %s" % l['name'])
- curs.execute("INSERT INTO lists (id, name, grp, active, pagecount) VALUES (%(id)s, %(name)s, %(groupid)s, %(active)s, 0)",
- l)
-
- conn.commit()