+++ /dev/null
-import datetime
-import httplib
-from Queue import Queue
-import threading
-import sys
-import time
-
-from lib.log import log
-from lib.parsers import ArchivesParser
-
-
-class MultiListCrawler(object):
- def __init__(self, lists, conn, status_interval=30, commit_interval=500):
- self.lists = lists
- self.conn = conn
- self.status_interval = status_interval
- self.commit_interval = commit_interval
-
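- # The work queue holds (listid, listname, year, month, maxmsg) tuples;
- # maxmsg is -1 when a month should be crawled from the very first message.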
- self.queue = Queue()
- self.counter = 0
- self.counterlock = threading.RLock()
- self.stopevent = threading.Event()
-
- def crawl(self, full=False, month=None):
- # Each thread can independently run on one month, so we can get
- # a reasonable spread. Therefore, submit them as separate jobs
- # to the queue.
- for listid, listname in self.lists:
- if full:
- # Generate a sequence of everything to index
- for year in range(1997, datetime.datetime.now().year + 1):
- for month in range(1, 13):
- self.queue.put((listid, listname, year, month, -1))
- elif month:
- # Do one specific month
- pieces = month.split("-")
- if len(pieces) != 2:
- print("Month format is <y>-<m>, cannot parse '%s'" % month)
- sys.exit(1)
- try:
- pieces = [int(x) for x in pieces]
- except ValueError:
- print("Month format is <y>-<m>, cannot convert '%s' to integers" % month)
- sys.exit(1)
- self.queue.put((listid, listname, pieces[0], pieces[1], -1))
- else:
- # In an incremental scan, we check the current month and the
- # previous one, but only for new messages.
- curs = self.conn.cursor()
- curr = datetime.date.today()
- if curr.month == 1:
- prev = datetime.date(curr.year - 1, 12, 1)
- else:
- prev = datetime.date(curr.year, curr.month - 1, 1)
-
- for d in curr, prev:
- # Figure out what the highest indexed page in this
- # month is.
- curs.execute("SELECT max(msgnum) FROM messages WHERE list=%(list)s AND year=%(year)s AND month=%(month)s", {
- 'list': listid,
- 'year': d.year,
- 'month': d.month,
- })
- x = curs.fetchall()
- if x[0][0] is not None:
- maxmsg = x[0][0]
- else:
- maxmsg = -1
- self.queue.put((listid, listname, d.year, d.month, maxmsg))
-
- for x in range(5):
- t = threading.Thread(name="Indexer %s" % x,
- target=lambda: self.crawl_from_queue())
- t.daemon = True
- t.start()
-
- t = threading.Thread(name="statusthread", target=lambda: self.status_thread())
- t.daemon = True
- t.start()
-
- # XXX: need a way to detect the case where all worker threads have
- # crashed while the queue is not yet empty!
- self.queue.join()
- self.stopevent.set()
-
- return self.counter
-
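- # Periodically log crawl progress and commit the shared connection
- # once at least commit_interval messages have been indexed.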
- def status_thread(self):
- lastcommit = 0
- starttime = time.time()
- while not self.stopevent.is_set():
- self.stopevent.wait(self.status_interval)
- nowtime = time.time()
- with self.counterlock:
- log("Indexed %s messages so far (%s active threads, %s months still queued, %.1f msg/sec)" % (
- self.counter,
- threading.active_count() - 2, # main thread + status thread
- self.queue.qsize(),
- self.counter / (nowtime - starttime),
- ))
- # Commit once commit_interval messages have been indexed since the last commit
- if self.counter - lastcommit > self.commit_interval:
- lastcommit = self.counter
- self.conn.commit()
-
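- # Worker thread loop: pull one (list, month) job at a time off the
- # queue and crawl it until the stop event is set.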
- def crawl_from_queue(self):
- while not self.stopevent.is_set():
- (listid, listname, year, month, maxmsg) = self.queue.get()
- self.crawl_month(listid, listname, year, month, maxmsg)
- self.queue.task_done()
-
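- # Walk the messages of a single month sequentially, starting just
- # past maxmsg, until we run off the end of the archive (404).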
- def crawl_month(self, listid, listname, year, month, maxmsg):
- currentmsg = maxmsg
- while True:
- currentmsg += 1
- try:
- if not self.crawl_single_message(listid, listname, year, month, currentmsg):
- break
- except Exception as e:
- log("Exception when crawling %s/%s/%s/%s - %s" % (
- listname, year, month, currentmsg, e))
- # Continue on to try the next message
-
- def crawl_single_message(self, listid, listname, year, month, msgnum):
- curs = self.conn.cursor()
- h = httplib.HTTPConnection(host="archives.postgresql.org",
- port=80,
- strict=True,
- timeout=10)
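- # Archive message URLs have the form /<listname>/<yyyy>-<mm>/msg<nnnnn>.php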
- url = "/%s/%04d-%02d/msg%05d.php" % (
- listname,
- year,
- month,
- msgnum)
- h.putrequest("GET", url)
- h.putheader("User-agent", "pgsearch/0.2")
- h.putheader("Connection", "close")
- h.endheaders()
- resp = h.getresponse()
- txt = resp.read()
- h.close()
-
- if resp.status == 404:
- # Past the end of the month
- return False
- elif resp.status != 200:
- raise Exception("%s/%s/%s/%s returned status %s" % (listname, year, month, msgnum, resp.status))
-
- # Else we have the message!
- p = ArchivesParser()
- if not p.parse(txt):
- log("Failed to parse %s/%s/%s/%s" % (listname, year, month, msgnum))
- # We return True to move on to the next message anyway!
- return True
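- # Store the message along with its full text index: the subject is
- # weighted 'A' and concatenated with the body, both using the 'pg'
- # text search configuration.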
- curs.execute("INSERT INTO messages (list, year, month, msgnum, date, subject, author, txt, fti) VALUES (%(listid)s, %(year)s, %(month)s, %(msgnum)s, %(date)s, %(subject)s, %(author)s, %(txt)s, setweight(to_tsvector('pg', %(subject)s), 'A') || to_tsvector('pg', %(txt)s))", {
- 'listid': listid,
- 'year': year,
- 'month': month,
- 'msgnum': msgnum,
- 'date': p.date,
- 'subject': p.subject[:127],
- 'author': p.author[:127],
- 'txt': p.body,
- })
- with self.counterlock:
- self.counter += 1
-
- return True
return self.pagedata.read()
-class ArchivesParser(object):
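- # mhonarc embeds the subject, the rot13-obfuscated author, the date and
- # the message body as comments in the generated HTML; extract them all
- # with a single regex.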
- rematcher = re.compile("<!--X-Subject: ([^\n]*) -->.*<!--X-From-R13: ([^\n]*) -->.*<!--X-Date: ([^\n]*) -->.*<!--X-Body-of-Message-->(.*)<!--X-Body-of-Message-End-->", re.DOTALL)
- hp = HTMLParser()
-
- def __init__(self):
- self.subject = None
- self.author = None
- self.date = None
- self.body = None
-
- def parse(self, contents):
- contents = lossy_unicode(contents)
- match = self.rematcher.search(contents)
- if not match:
- return False
- self.subject = self.hp.unescape(match.group(1))
- self.author = self.almost_rot13(self.hp.unescape(match.group(2)))
- if not self.parse_date(self.hp.unescape(match.group(3))):
- return False
- self.body = self.hp.unescape(match.group(4))
- return True
-
- _date_multi_re = re.compile(' \((\w+\s\w+|)\)$')
- _date_trailing_envelope = re.compile('\s+\(envelope.*\)$')
-
- def parse_date(self, d):
- # For some reason, we have dates that look like this:
- # http://archives.postgresql.org/pgsql-bugs/1999-05/msg00018.php
- # Looks like an mhonarc bug, but let's just remove that trailing
- # stuff here to be sure...
- if self._date_trailing_envelope.search(d):
- d = self._date_trailing_envelope.sub('', d)
-
- # We have a number of dates in the format
- # "<full datespec> +0200 (MET DST)"
- # or similar. The problem comes from the space within the
- # parentheses, or from the contents of the parentheses being
- # completely empty.
- if self._date_multi_re.search(d):
- d = self._date_multi_re.sub('', d)
- # Isn't it wonderful: a string with a trailing quote but no
- # leading quote? MUAs are weird...
- if d.endswith('"') and not d.startswith('"'):
- d = d[:-1]
-
- # We also have "known incorrect timezone specs".
- if d.endswith('MST7MDT'):
- d = d[:-4]
- elif d.endswith('METDST'):
- d = d[:-3]
- elif d.endswith('"MET'):
- d = d[:-4] + "MET"
-
- try:
- self.date = dateutil.parser.parse(d)
- except ValueError:
- log("Failed to parse date '%s'" % d)
- return False
-
- if self.date.utcoffset():
- # We have some messages with completely incorrect utc offsets,
- # so we need to reject those too
- if self.date.utcoffset() > timedelta(hours=12) or self.date.utcoffset() < timedelta(hours=-12):
- log("Failed to parse date '%s', timezone offset out of range." % d)
- return False
-
- return True
-
- # Semi-hacked rot13, because the one used by mhonarc is broken.
- # So we copy the brokenness here.
- # This code is from MHonArc/ewhutil.pl, mrot13()
- _arot13_trans = dict(list(zip(list(map(ord,
- '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[abcdefghijklmnopqrstuvwxyz')),
- 'NOPQRSTUVWXYZ[@ABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm')))
-
- def almost_rot13(self, s):
- return str(s).translate(self._arot13_trans)
-
-
class RobotsParser(object):
def __init__(self, url):
try:
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from lib.log import log
-from lib.archives import MultiListCrawler
-from lib.threadwrapper import threadwrapper
-from ConfigParser import ConfigParser
-from optparse import OptionParser
-import psycopg2
-import sys
-import time
-
-
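-# Connect to the search database, crawl the selected (or all active) lists,
-# then update the per-list page counts and the last-crawl timestamp.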
-def doit(opt):
- cp = ConfigParser()
- cp.read("search.ini")
- psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
- conn = psycopg2.connect(cp.get("search", "db"))
-
- curs = conn.cursor()
-
- if opt.list:
- # Multiple lists can be specified with a comma separator (no spaces)
- curs.execute("SELECT id,name FROM lists WHERE name=ANY(%(names)s)", {
- 'names': opt.list.split(','),
- })
- else:
- curs.execute("SELECT id,name FROM lists WHERE active ORDER BY id")
-
- listinfo = [(id, name) for id, name in curs.fetchall()]
- c = MultiListCrawler(listinfo, conn, opt.status_interval, opt.commit_interval)
- n = c.crawl(opt.full, opt.month)
-
- # Update total counts
- curs.execute("WITH t AS (SELECT list,count(*) AS c FROM messages GROUP BY list) UPDATE lists SET pagecount=t.c FROM t WHERE id=t.list")
- # Indicate when we crawled
- curs.execute("UPDATE lastcrawl SET lastcrawl=CURRENT_TIMESTAMP")
- conn.commit()
-
- log("Indexed %s messages" % n)
- time.sleep(1)
-
-
-if __name__ == "__main__":
- parser = OptionParser()
- parser.add_option("-l", "--list", dest='list', help="Crawl only this list")
- parser.add_option("-m", "--month", dest='month', help="Crawl only this month")
- parser.add_option("-f", "--full", dest='full', action="store_true", help="Make a full crawl")
- parser.add_option("-t", "--status-interval", dest='status_interval', help="Seconds between status updates")
- parser.add_option("-c", "--commit-interval", dest='commit_interval', help="Messages between each commit")
-
- (opt, args) = parser.parse_args()
-
- if opt.full and opt.month:
- print("Can't use both full and specific month!")
- sys.exit(1)
-
- # assign default values
- opt.status_interval = int(opt.status_interval) if opt.status_interval else 30
- opt.commit_interval = int(opt.commit_interval) if opt.commit_interval else 500
-
- threadwrapper(doit, opt)
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from lib.log import log
-from ConfigParser import ConfigParser
-import psycopg2
-import urllib
-import simplejson as json
-
-if __name__ == "__main__":
- cp = ConfigParser()
- cp.read("search.ini")
- psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
- conn = psycopg2.connect(cp.get("search", "db"))
- curs = conn.cursor()
-
- u = urllib.urlopen("http://%s/community/lists/listinfo/" % cp.get("search", "web"))
- obj = json.load(u)
- u.close()
-
- # We don't care about the groups here, just the lists!
- curs.execute("SELECT id, name, grp, active FROM lists")
- lists = curs.fetchall()
- for id, name, groupid, active in lists:
- thislist = [x for x in obj['lists'] if x['id'] == id]
- if len(thislist) == 0:
- log("List %s should be removed, do that manually!" % name)
- else:
- # Compare contents of list
- l = thislist[0]
- if l['name'] != name:
- log("Renaming list %s -> %s" % (name, l['name']))
- curs.execute("UPDATE lists SET name=%(name)s WHERE id=%(id)s", l)
-
- if l['active'] != active:
- log("Changing active flag for %s to %s" % (l['name'], l['active']))
- curs.execute("UPDATE lists SET active=%(active)s WHERE id=%(id)s", l)
- if l['groupid'] != groupid:
- log("Changing group for %s to %s" % (l['name'], l['groupid']))
- curs.execute("UPDATE lists SET grp=%(groupid)s WHERE id=%(id)s", l)
-
- for l in obj['lists']:
- thislist = [x for x in lists if x[0] == l['id']]
- if len(thislist) == 0:
- log("Adding list %s" % l['name'])
- curs.execute("INSERT INTO lists (id, name, grp, active, pagecount) VALUES (%(id)s, %(name)s, %(groupid)s, %(active)s, 0)",
- l)
-
- conn.commit()