Reimplement list searching on top of http API
authorMagnus Hagander <magnus@hagander.net>
Sat, 12 Jan 2013 16:48:09 +0000 (17:48 +0100)
committerMagnus Hagander <magnus@hagander.net>
Sat, 12 Jan 2013 16:48:09 +0000 (17:48 +0100)
The new archives site has an HTTP API - use that for searching instead
of talking directly to the database.

With the new API, we always fetch the complete search results (still
capped server-side at 1000 items) and store them locally in memcached
for 10 minutes. That way, paging will only hit the local memcached and
not the remote HTTP API *or* the SQL API.

pgweb/search/views.py
pgweb/settings.py
templates/search/listsearch.html

index 0390f147c611e4c86bd58c5e66b00ee80bb994b2..30d7aa8695f33d1f42a05dc00319f2e60d20da56 100644 (file)
@@ -6,11 +6,20 @@ from django.conf import settings
 from pgweb.util.decorators import cache
 
 import datetime
+import httplib
 import urllib
 import psycopg2
+import simplejson as json
 
 from lists.models import MailingList, MailingListGroup
 
+# Conditionally import memcached library. Everything will work without
+# it, so we allow development installs to run without it...
+try:
+       import pylibmc
+       has_memcached=True
+except:
+       has_memcached=False
 
 def generate_pagelinks(pagenum, totalpages, querystring):
        # Generate a list of links to page through a search result
@@ -80,10 +89,6 @@ def search(request):
 
                if not dateval:
                        dateval = 365
-               if dateval == -1:
-                       firstdate = None
-               else:
-                       firstdate = datetime.datetime.today()-datetime.timedelta(days=dateval)
 
                sortoptions = (
                        {'val':'r', 'text': 'Rank', 'selected': not (request.REQUEST.has_key('s') and request.REQUEST['s'] == 'd')},
@@ -143,34 +148,57 @@ def search(request):
 
        firsthit = (pagenum - 1) * hitsperpage + 1
 
-       # Get ourselves a connection
-       try:
-               conn = psycopg2.connect(settings.SEARCH_DSN)
-               curs = conn.cursor()
-       except:
-               return render_to_response('search/sitesearch.html', {
-                               'search_error': 'Could not connect to search database.'
-                               })
-
        if searchlists:
-               # perform the query for list archives search
-               curs.execute("SELECT * from archives_search(%(query)s, %(listid)s, %(firstdate)s, NULL, %(firsthit)s, %(hitsperpage)s, %(sort)s)", {
-                               'query': query,
-                               'firsthit': firsthit - 1,
-                               'hitsperpage': hitsperpage,
-                               'listid': listid,
-                               'firstdate': firstdate,
-                               'sort': listsort,
-                               })
-               hits = curs.fetchall()
-               conn.close()
-               totalhits = int(hits[-1][1])
+               # Lists are searched by passing the work down using a http
+               # API. In the future, we probably want to do everything
+               # through a http API and merge hits, but that's for later
+               p = {
+                       'q': query,
+                       's': listsort,
+                       }
+               if listid:
+                       if listid < 0:
+                               # This is a list group, we expand that on the web server
+                               p['l'] = ','.join([str(x.id) for x in MailingList.objects.filter(group=-listid)])
+                       else:
+                               p['l'] = listid
+               if dateval:
+                       p['d'] = dateval
+               urlstr = urllib.urlencode(p)
+               # If memcached is available, let's try it
+               hits = None
+               if has_memcached:
+                       memc = pylibmc.Client(['127.0.0.1',], binary=True, behaviors={'tcp_nodelay':True})
+                       try:
+                               hits = memc.get(urlstr)
+                       except Exception, e:
+                               # If we had an exception, don't try to store either
+                               memc = None
+               if not hits:
+                       # No hits found - so try to get them from the search server
+                       c = httplib.HTTPConnection(settings.ARCHIVES_SEARCH_SERVER, strict=True, timeout=5)
+                       c.request('POST', '/archives-search/', urlstr)
+                       r = c.getresponse()
+                       if r.status != 200:
+                               memc = None
+                               return render_to_response('search/listsearch.html', {
+                                               'search_error': 'Error talking to search server: %s' % r.reason,
+                                               })
+                       hits = json.loads(r.read())
+                       if has_memcached and memc:
+                               # Store them in memcached too! But only for 10 minutes...
+                               # And always compress it, just because we can
+                               memc.set(urlstr, hits, 60*10, 1)
+                               memc = None
+
+               totalhits = len(hits)
                querystr = "?m=1&q=%s&l=%s&d=%s&s=%s" % (
                        urllib.quote_plus(query.encode('utf-8')),
                        listid or '',
                        dateval,
                        listsort
                        )
+
                return render_to_response('search/listsearch.html', {
                                'hitcount': totalhits,
                                'firsthit': firsthit,
@@ -181,23 +209,32 @@ def search(request):
                                                                           totalhits / hitsperpage + 1,
                                                                           querystr)),
                                'hits': [{
-                                               'list': h[0],
-                                               'year': h[1],
-                                               'month': "%02d" % h[2],
-                                               'msgnum': "%05d" % h[3],
-                                               'date': h[4],
-                                               'subject': h[5],
-                                               'author': h[6],
-                                               'abstract': h[7].replace("[[[[[[", "<b>").replace("]]]]]]","</b>"),
-                                               'rank': h[8],
-                                               } for h in hits[:-1]],
+                                               'list': h['l'],
+                                               'date': h['d'],
+                                               'subject': h['s'],
+                                               'author': h['f'],
+                                               'messageid': h['m'],
+                                               'abstract': h['a'],
+                                               'rank': h['r'],
+                                               } for h in hits[firsthit-1:firsthit+hitsperpage-1]],
                                'sortoptions': sortoptions,
                                'lists': MailingList.objects.all().order_by("group__sortkey"),
                                'listid': listid,
                                'dates': dateoptions,
                                'dateval': dateval,
                                })
+
        else:
+               # Website search is still done by making a regular pgsql connection
+               # to the search server.
+               try:
+                       conn = psycopg2.connect(settings.SEARCH_DSN)
+                       curs = conn.cursor()
+               except:
+                       return render_to_response('search/sitesearch.html', {
+                                       'search_error': 'Could not connect to search database.'
+                                       })
+
                # perform the query for general web search
                curs.execute("SELECT * FROM site_search(%(query)s, %(firsthit)s, %(hitsperpage)s, %(allsites)s, %(suburl)s)", {
                                'query': query,
index 1596d6bdd07ea76a101fbb2fa06cbf9b3c7ea508..c4e69589e1bedeb915ba2a8ddd95d65c8aa7eebf 100644 (file)
@@ -134,6 +134,7 @@ FRONTEND_SERVERS=()                                    # A tuple containing the
 FTP_MASTERS=()                                                                            # A tuple containing the *IP addresses* of all machines
                                                        # trusted to upload ftp structure data
 VARNISH_PURGERS=()                                     # Extra servers that can do varnish purges through our queue
+ARCHIVES_SEARCH_SERVER="archives.postgresql.org"       # Where to post REST request for archives search
 
 # Load local settings overrides
 from settings_local import *
index 64aa51f68bd726cc92b7315074aea4cdbb80d8f1..27a0879168c438919943b72a6b5d726d406a1f58 100644 (file)
    <h2>Results {{firsthit}}-{{lasthit}} of {%if hitcount = 1000%}more than 1000{%else%}{{hitcount}}{%endif%}.</h2>
    {%if pagelinks %}Result pages: {{pagelinks|safe}}<br/><br/>{%endif%}
    {%for hit in hits %}
-    {{forloop.counter0|add:firsthit}}. <a href="http://archives.postgresql.org/{{hit.list}}/{{hit.year}}-{{hit.month}}/msg{{hit.msgnum}}.php">{{hit.subject}}</a> [{{hit.rank|floatformat:2}}]<br/>
+    {{forloop.counter0|add:firsthit}}. <a href="http://archives.postgresql.org/message-id/{{hit.messageid}}">{{hit.subject}}</a> [{{hit.rank|floatformat:2}}]<br/>
+    From {{hit.author}} on {{hit.date}}.<br/>
 {{hit.abstract|safe}}<br/>
-<a href="http://archives.postgresql.org/{{hit.list}}/{{hit.year}}-{{hit.month}}/msg{{hit.msgnum}}.php">http://archives.postgresql.org/{{hit.list}}/{{hit.year}}-{{hit.month}}/msg{{hit.msgnum}}.php</a><br/>
+<a href="http://archives.postgresql.org/message-id/{{hit.messageid}}">http://archives.postgresql.org/message-id/{{hit.messageid}}</a><br/>
     <br/>
    {%endfor%}
    {%if pagelinks %}Result pages: {{pagelinks|safe}}<br/><br/>{%endif%}